quantum-memory-graph 1.2.0__tar.gz → 1.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {quantum_memory_graph-1.2.0/quantum_memory_graph.egg-info → quantum_memory_graph-1.2.1}/PKG-INFO +21 -5
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/README.md +10 -4
- quantum_memory_graph-1.2.1/benchmarks/run_longmemeval_cvar_v2.py +272 -0
- quantum_memory_graph-1.2.1/benchmarks/run_longmemeval_staged.py +355 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/quantum_memory_graph/__init__.py +1 -1
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1/quantum_memory_graph.egg-info}/PKG-INFO +21 -5
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/quantum_memory_graph.egg-info/SOURCES.txt +2 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/setup.cfg +12 -1
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/LICENSE +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/benchmarks/__init__.py +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/benchmarks/data_collector.py +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/benchmarks/fast_longmemeval.py +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/benchmarks/generate_scenarios.py +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/benchmarks/longmemeval_bench.py +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/benchmarks/longmemeval_bench_v2.py +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/benchmarks/longmemeval_bench_v3.py +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/benchmarks/longmemeval_bench_v4.py +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/benchmarks/longmemeval_bench_v5.py +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/benchmarks/longmemeval_bench_v6.py +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/benchmarks/longmemeval_bench_v7.py +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/benchmarks/memcombine.py +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/benchmarks/run_final.py +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/benchmarks/run_full_benchmark.py +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/benchmarks/run_full_benchmark_v2.py +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/benchmarks/run_longmemeval_chunked_staged.py +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/pyproject.toml +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/quantum_memory_graph/__main__.py +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/quantum_memory_graph/api.py +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/quantum_memory_graph/graph.py +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/quantum_memory_graph/pce_optimizer.py +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/quantum_memory_graph/pipeline.py +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/quantum_memory_graph/recency.py +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/quantum_memory_graph/subgraph_optimizer.py +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/quantum_memory_graph.egg-info/dependency_links.txt +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/quantum_memory_graph.egg-info/requires.txt +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/quantum_memory_graph.egg-info/top_level.txt +0 -0
- {quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/tests/test_full_pipeline.py +0 -0
{quantum_memory_graph-1.2.0/quantum_memory_graph.egg-info → quantum_memory_graph-1.2.1}/PKG-INFO
RENAMED
|
@@ -1,15 +1,25 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: quantum-memory-graph
|
|
3
|
-
Version: 1.2.0
|
|
3
|
+
Version: 1.2.1
|
|
4
4
|
Summary: Quantum-optimized knowledge graph memory for AI agents. Relationship-aware subgraph selection via QAOA.
|
|
5
5
|
Home-page: https://github.com/Dustin-a11y/quantum-memory-graph
|
|
6
6
|
Author: Coinkong (Chef's Attraction)
|
|
7
7
|
License: MIT
|
|
8
|
+
Project-URL: Source Code, https://github.com/Dustin-a11y/quantum-memory-graph
|
|
9
|
+
Project-URL: Issue Tracker, https://github.com/Dustin-a11y/quantum-memory-graph/issues
|
|
10
|
+
Project-URL: Benchmark Results, https://github.com/Dustin-a11y/quantum-memory-graph/tree/main/benchmarks
|
|
11
|
+
Project-URL: LongMemEval Submission, https://github.com/xiaowu0162/LongMemEval/issues
|
|
8
12
|
Keywords: quantum,memory,knowledge-graph,agents,qaoa,ai
|
|
9
13
|
Classifier: Development Status :: 4 - Beta
|
|
10
14
|
Classifier: Intended Audience :: Developers
|
|
11
15
|
Classifier: License :: OSI Approved :: MIT License
|
|
12
16
|
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
13
23
|
Requires-Python: >=3.9
|
|
14
24
|
Description-Content-Type: text/markdown
|
|
15
25
|
License-File: LICENSE
|
|
@@ -37,9 +47,9 @@ Every memory system treats memories as independent documents — search, rank, s
|
|
|
37
47
|
|
|
38
48
|
Quantum Memory Graph maps these relationships, then uses QAOA to find the optimal *combination* of memories — not just the most relevant individuals, but the best connected subgraph that gives your agent maximum context.
|
|
39
49
|
|
|
40
|
-
## Benchmark: MemCombine
|
|
50
|
+
## Benchmark: MemCombine (Internal — Memory Combination)
|
|
41
51
|
|
|
42
|
-
|
|
52
|
+
MemCombine tests what no existing benchmark measures — **memory combination quality**, where QAOA graph selection finds coherent subsets that embedding similarity misses.
|
|
43
53
|
|
|
44
54
|
| Method | Coverage | Evidence Recall | F1 | Perfect |
|
|
45
55
|
|--------|----------|----------------|----|---------|
|
|
@@ -48,6 +58,13 @@ We created MemCombine to test what no existing benchmark measures — **memory c
|
|
|
48
58
|
| **Advantage** | **+26.8%** | **+25.4%** | **+24.5%** | |
|
|
49
59
|
|
|
50
60
|
When the task is "find memories that work *together*," graph-aware quantum selection crushes pure similarity search.
|
|
61
|
+
|
|
62
|
+
> **How to read the LongMemEval table below:** The R@5/R@10 numbers are driven by QMG's chunked
|
|
63
|
+
> embedding retrieval pipeline (Stage 1: gte-large, 500-char chunks, mean-of-top-3
|
|
64
|
+
> scoring). QAOA (Stage 2) refines the top-14 candidates for relationship-aware
|
|
65
|
+
> selection — its advantage shows up in MemCombine (combination quality) rather
|
|
66
|
+
> than raw recall rank. The pipeline as a whole achieves #1.
|
|
67
|
+
|
|
51
68
|
## 🏆 #1 on LongMemEval (ICLR 2025 Benchmark)
|
|
52
69
|
|
|
53
70
|
Tested on the official [LongMemEval benchmark](https://arxiv.org/abs/2410.10813) for long-term memory in AI agents:
|
|
@@ -57,7 +74,7 @@ Tested on the official [LongMemEval benchmark](https://arxiv.org/abs/2410.10813)
|
|
|
57
74
|
| OMEGA (prev SOTA) | — | 89.2% | 94.1% | 87.5% |
|
|
58
75
|
| Mastra OM | — | 91.0% | 95.2% | 89.1% |
|
|
59
76
|
| **QMG v1.1 (published #1)** | — | **95.8%** | **98.85%** | **93.2%** |
|
|
60
|
-
| **QMG v1.2
|
|
77
|
+
| **QMG v1.2 — chunked retrieval pipeline** 🏆 | **90.6%** | **98.6%** | **99.4%** | **94.26%** |
|
|
61
78
|
|
|
62
79
|
**Benchmark run:** 500 questions, chunked gte-large embeddings (500-char blocks, 100-char overlap, mean-of-top-3 session scoring). Verified on DGX Spark GB10 (CUDA, ~53 min).
|
|
63
80
|
|
|
@@ -65,7 +82,6 @@ Tested on the official [LongMemEval benchmark](https://arxiv.org/abs/2410.10813)
|
|
|
65
82
|
|
|
66
83
|
**See:** `benchmarks/run_longmemeval_chunked_staged.py` for the exact benchmark code, `benchmarks/longmemeval_chunked_staged_results.json` for full per-question results.
|
|
67
84
|
|
|
68
|
-
|
|
69
85
|
## Install
|
|
70
86
|
|
|
71
87
|
```bash
|
|
@@ -6,9 +6,9 @@ Every memory system treats memories as independent documents — search, rank, s
|
|
|
6
6
|
|
|
7
7
|
Quantum Memory Graph maps these relationships, then uses QAOA to find the optimal *combination* of memories — not just the most relevant individuals, but the best connected subgraph that gives your agent maximum context.
|
|
8
8
|
|
|
9
|
-
## Benchmark: MemCombine
|
|
9
|
+
## Benchmark: MemCombine (Internal — Memory Combination)
|
|
10
10
|
|
|
11
|
-
|
|
11
|
+
MemCombine tests what no existing benchmark measures — **memory combination quality**, where QAOA graph selection finds coherent subsets that embedding similarity misses.
|
|
12
12
|
|
|
13
13
|
| Method | Coverage | Evidence Recall | F1 | Perfect |
|
|
14
14
|
|--------|----------|----------------|----|---------|
|
|
@@ -17,6 +17,13 @@ We created MemCombine to test what no existing benchmark measures — **memory c
|
|
|
17
17
|
| **Advantage** | **+26.8%** | **+25.4%** | **+24.5%** | |
|
|
18
18
|
|
|
19
19
|
When the task is "find memories that work *together*," graph-aware quantum selection crushes pure similarity search.
|
|
20
|
+
|
|
21
|
+
> **How to read the LongMemEval table below:** The R@5/R@10 numbers are driven by QMG's chunked
|
|
22
|
+
> embedding retrieval pipeline (Stage 1: gte-large, 500-char chunks, mean-of-top-3
|
|
23
|
+
> scoring). QAOA (Stage 2) refines the top-14 candidates for relationship-aware
|
|
24
|
+
> selection — its advantage shows up in MemCombine (combination quality) rather
|
|
25
|
+
> than raw recall rank. The pipeline as a whole achieves #1.
|
|
26
|
+
|
|
20
27
|
## 🏆 #1 on LongMemEval (ICLR 2025 Benchmark)
|
|
21
28
|
|
|
22
29
|
Tested on the official [LongMemEval benchmark](https://arxiv.org/abs/2410.10813) for long-term memory in AI agents:
|
|
@@ -26,7 +33,7 @@ Tested on the official [LongMemEval benchmark](https://arxiv.org/abs/2410.10813)
|
|
|
26
33
|
| OMEGA (prev SOTA) | — | 89.2% | 94.1% | 87.5% |
|
|
27
34
|
| Mastra OM | — | 91.0% | 95.2% | 89.1% |
|
|
28
35
|
| **QMG v1.1 (published #1)** | — | **95.8%** | **98.85%** | **93.2%** |
|
|
29
|
-
| **QMG v1.2
|
|
36
|
+
| **QMG v1.2 — chunked retrieval pipeline** 🏆 | **90.6%** | **98.6%** | **99.4%** | **94.26%** |
|
|
30
37
|
|
|
31
38
|
**Benchmark run:** 500 questions, chunked gte-large embeddings (500-char blocks, 100-char overlap, mean-of-top-3 session scoring). Verified on DGX Spark GB10 (CUDA, ~53 min).
|
|
32
39
|
|
|
@@ -34,7 +41,6 @@ Tested on the official [LongMemEval benchmark](https://arxiv.org/abs/2410.10813)
|
|
|
34
41
|
|
|
35
42
|
**See:** `benchmarks/run_longmemeval_chunked_staged.py` for the exact benchmark code, `benchmarks/longmemeval_chunked_staged_results.json` for full per-question results.
|
|
36
43
|
|
|
37
|
-
|
|
38
44
|
## Install
|
|
39
45
|
|
|
40
46
|
```bash
|
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
LongMemEval 500-question Benchmark — QMG CVaR subgraph optimizer.
|
|
4
|
+
|
|
5
|
+
Routes each question through the QMG subgraph optimizer on Spark.
|
|
6
|
+
Measures recall@K against gold answer sessions.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python3 -u run_longmemeval_cvar_v2.py --limit 5 # Quick test
|
|
10
|
+
python3 -u run_longmemeval_cvar_v2.py --force # Full 500
|
|
11
|
+
python3 -u run_longmemeval_cvar_v2.py --fast # Skip QMG, cosine only
|
|
12
|
+
|
|
13
|
+
Output: JSON results + CSV saved to benchmarks/ directory.
|
|
14
|
+
"""
|
|
15
|
+
import json, time, math, sys, os, argparse, csv
|
|
16
|
+
from datetime import datetime, timezone
|
|
17
|
+
import numpy as np
|
|
18
|
+
|
|
19
|
+
# Input and output locations. The defaults are the absolute paths this script
# shipped with; each can be overridden through the environment so the
# benchmark can run on another machine without editing the file.
DATA_PATH = os.environ.get(
    "LONGMEMEVAL_DATA",
    "/home/dt/projects-shared/LongMemEval/data/longmemeval_s_cleaned.json",
)
RESULTS_DIR = os.environ.get("QMG_RESULTS_DIR", "/home/dt/qmg-v1/benchmarks")
RESULTS_FILE = os.path.join(RESULTS_DIR, "longmemeval_cvar_results.json")
CSV_FILE = os.path.join(RESULTS_DIR, "longmemeval_cvar_results.csv")

# Wall-clock reference taken at import time; progress and total-time reports
# are measured from this instant.
T_START = time.time()
|
|
25
|
+
|
|
26
|
+
def flatten_session(session):
    """Collapse one haystack session into a single text string.

    Strings are returned unchanged. A list is treated as a sequence of
    turns: a dict turn renders as ``"<role>: <body>"`` where the body is the
    ``content`` value if that key exists, else the ``text`` value, else
    ``str(turn)``; any other turn renders via ``str``. Rendered turns are
    joined with newlines. Every other input type is returned as
    ``str(session)``.
    """
    if isinstance(session, str):
        return session
    if not isinstance(session, list):
        return str(session)

    def render(turn):
        # Non-dict turns have no role/content structure to unpack.
        if not isinstance(turn, dict):
            return str(turn)
        speaker = turn.get("role", "")
        if "content" in turn:
            body = turn["content"]
        elif "text" in turn:
            body = turn["text"]
        else:
            body = str(turn)
        return f"{speaker}: {body}"

    return "\n".join(render(turn) for turn in session)
|
|
37
|
+
|
|
38
|
+
def load_data(path, limit=None):
    """Load benchmark items from a JSON file.

    Args:
        path: Path of the JSON file to read.
        limit: When truthy, only the first ``limit`` items are kept.

    Returns:
        The parsed JSON value if it is a list. Otherwise the value stored
        under the first of the keys ``"data"``, ``"questions"``,
        ``"items"``, ``"results"`` that is present; if none is present the
        parsed value is returned as-is.
    """
    # Read as UTF-8 explicitly: without it open() falls back to the platform
    # locale encoding, which can mis-decode non-ASCII text in the dataset.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, list):
        for key in ("data", "questions", "items", "results"):
            if key in data:
                data = data[key]
                break
    if limit:
        data = data[:limit]
    return data
|
|
45
|
+
|
|
46
|
+
def recall_at_k(ranked, gold, K):
    """Return 1.0 if any gold item appears in the first ``K`` ranked items.

    An empty ``gold`` collection counts as a hit (1.0); otherwise the result
    is 1.0 when the top-``K`` slice of ``ranked`` shares at least one element
    with ``gold`` and 0.0 when it shares none.
    """
    relevant = set(gold)
    if len(relevant) == 0:
        return 1.0
    hit = any(candidate in relevant for candidate in ranked[:K])
    if hit:
        return 1.0
    return 0.0
|
|
50
|
+
|
|
51
|
+
def ndcg_at_k(ranked, gold, K):
    """Compute binary-relevance NDCG over the first ``K`` ranked items.

    Each gold item found at 0-based position ``i`` contributes
    ``1 / log2(i + 2)``. The total is divided by the ideal total for
    ``min(len(set(gold)), K)`` hits placed at the top. An empty ``gold``
    yields 1.0; an ideal total of zero yields 0.0.
    """
    relevant = set(gold)
    if not relevant:
        return 1.0
    gains = [
        1.0 / math.log2(pos + 2)
        for pos, item in enumerate(ranked[:K])
        if item in relevant
    ]
    ideal_gains = [
        1.0 / math.log2(pos + 2)
        for pos in range(min(len(relevant), K))
    ]
    ideal_total = sum(ideal_gains)
    if ideal_total > 0:
        return sum(gains) / ideal_total
    return 0.0
|
|
57
|
+
|
|
58
|
+
def _import_optimizer():
    """Return ``optimize_subgraph`` imported with /home/dt/qmg-v1 on sys.path."""
    qmg_root = "/home/dt/qmg-v1"
    # Guard the insert: this is called once per question, and an
    # unconditional insert would add a duplicate sys.path entry each time.
    if qmg_root not in sys.path:
        sys.path.insert(0, qmg_root)
    from quantum_memory_graph.subgraph_optimizer import optimize_subgraph
    return optimize_subgraph


def _rank_metrics(ranked, gold_indices):
    """Score one ranking with recall_at_k at 1/5/10 and ndcg_at_k at 10."""
    return {
        "r1": float(recall_at_k(ranked, gold_indices, 1)),
        "r5": float(recall_at_k(ranked, gold_indices, 5)),
        "r10": float(recall_at_k(ranked, gold_indices, 10)),
        "ndcg": float(ndcg_at_k(ranked, gold_indices, 10)),
    }


def _print_rank_summary(items, key):
    """Print mean R@1/R@5/R@10 (as percentages) and mean NDCG of ``r[key]``."""
    for label, field in (("R@1", "r1"), ("R@5", "r5"), ("R@10", "r10")):
        value = np.mean([r[key][field] for r in items]) * 100
        print(f" {label}: {value:.1f}%", flush=True)
    ndcg = np.mean([r[key]["ndcg"] for r in items])
    print(f" NDCG: {ndcg:.4f}", flush=True)


def _write_results(results, n_total):
    """Write all results to RESULTS_FILE (JSON) and non-skipped rows to CSV_FILE."""
    with open(RESULTS_FILE, "w", encoding="utf-8") as f:
        json.dump(
            {
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "n_total": n_total,
                "results": results,
            },
            f,
            indent=2,
            default=str,
        )
    print(f"\nSaved to {RESULTS_FILE}", flush=True)

    with open(CSV_FILE, "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["idx", "n", "ngold", "cr1", "cr5", "cr10", "cndcg",
                    "dr1", "dr5", "dr10", "dndcg", "dmethod",
                    "rr1", "rr5", "rr10", "rndcg", "rmethod"])
        for r in results:
            if r.get("skip"):
                continue
            # A config that failed or was skipped (--fast) is absent from
            # the record; its columns are written empty.
            default = r.get("default", {})
            retrieval = r.get("retrieval", {})
            w.writerow([
                r["idx"], r["n_sessions"], r["n_gold"],
                r["cosine"]["r1"], r["cosine"]["r5"], r["cosine"]["r10"], r["cosine"]["ndcg"],
                default.get("r1"), default.get("r5"),
                default.get("r10"), default.get("ndcg"),
                default.get("method"),
                retrieval.get("r1"), retrieval.get("r5"),
                retrieval.get("r10"), retrieval.get("ndcg"),
                retrieval.get("method"),
            ])
    print(f"CSV saved to {CSV_FILE}", flush=True)


def main():
    """Run the benchmark: cosine baseline plus two optimize_subgraph configs.

    Command-line flags:
        --limit N   only evaluate the first N loaded questions
        --fast      skip the optimizer stage and report the cosine baseline only
        --force     ignore --limit and evaluate every loaded question

    For each question the haystack sessions are embedded with gte-large and
    ranked by dot product against the question embedding (embeddings are
    requested normalized). Unless --fast is given, the top 14 sessions are
    passed to ``optimize_subgraph`` under the "default" and "retrieval"
    weightings; the returned selection is placed first and the remaining
    sessions follow in cosine order. Questions with no resolvable gold
    session or fewer than 3 haystack sessions are recorded as skipped.
    Metrics are printed and saved to RESULTS_FILE and CSV_FILE.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--fast", action="store_true", help="Skip QMG, cosine only")
    parser.add_argument("--force", action="store_true", help="Run full 500")
    args = parser.parse_args()

    # Create the output directory before any work starts, so a long run
    # cannot fail at the final write because the directory is missing.
    os.makedirs(RESULTS_DIR, exist_ok=True)

    data = load_data(DATA_PATH)
    print(f"Loaded {len(data)} questions", flush=True)

    # --force takes precedence over --limit.
    limit = None if args.force else args.limit
    if limit:
        data = data[:limit]

    # Imported here rather than at module top, so argument parsing and data
    # loading happen before the heavy ML dependencies are loaded.
    from sentence_transformers import SentenceTransformer
    import torch
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Loading gte-large on {device}...", flush=True)
    model = SentenceTransformer("thenlper/gte-large", device=device)
    dim = model.get_sentence_embedding_dimension()
    print(f"Model loaded, dim={dim}", flush=True)

    qmg_configs = [
        ("default", {"alpha": 0.4, "beta_conn": 0.35, "gamma_cov": 0.25, "shots": 4096}),
        ("retrieval", {"alpha": 1.0, "beta_conn": 0.0, "gamma_cov": 0.0, "shots": 4096}),
    ]

    results = []
    n_questions = len(data)

    for idx, item in enumerate(data):
        question = item.get("question", item.get("query", ""))
        haystack = item.get("haystack_sessions", item.get("sessions", item.get("corpus", [])))
        haystack_ids = item.get("haystack_session_ids", item.get("session_ids", []))
        answer_ids = item.get("answer_session_ids", item.get("answer_ids", []))

        # Positions of the gold sessions in the haystack; answer ids that do
        # not occur in the haystack are ignored.
        gold_indices = [haystack_ids.index(g) for g in answer_ids if g in haystack_ids]

        if not gold_indices or len(haystack) < 3:
            results.append({"idx": idx, "skip": True, "reason": "no_gold_or_too_few"})
            continue

        texts = [flatten_session(s) for s in haystack]

        # Encode the question together with the sessions in one batch call.
        embs = model.encode([question] + texts, normalize_embeddings=True,
                            batch_size=32, show_progress_bar=False)
        q_emb = embs[0]
        sess_embs = embs[1:]

        n_sessions = len(sess_embs)
        K_target = min(5, n_sessions)

        # Cosine baseline
        t0 = time.time()
        cos_scores = q_emb @ sess_embs.T
        cos_ranked = np.argsort(cos_scores)[::-1].tolist()
        cos_time = time.time() - t0

        r = {
            "idx": idx,
            "question": question[:120],
            "n_sessions": n_sessions,
            "n_gold": len(gold_indices),
            "cosine": {**_rank_metrics(cos_ranked, gold_indices), "time": cos_time},
        }

        # QMG CVaR optimizer — two configs
        if not args.fast:
            qmg_t0 = time.time()
            try:
                optimize_subgraph = _import_optimizer()

                # Build adjacency from session embeddings (cosine similarity matrix)
                adj = sess_embs @ sess_embs.T
                np.fill_diagonal(adj, 0.0)

                # Cap candidates at 14 for QAOA to avoid memory OOM
                # (2^14 = 16K complex numbers, 2^40 = 17TB).
                # The candidate pool is the same for every config, so it is
                # computed once per question.
                top_indices = np.asarray(cos_ranked[:14])
                top_scores = cos_scores[top_indices]
                top_adj = adj[np.ix_(top_indices, top_indices)]

                for cfg_name, cfg in qmg_configs:
                    # Time each config on its own clock so the second
                    # config's figure does not include the first one's run.
                    cfg_t0 = time.time()
                    result = optimize_subgraph(
                        relevance_scores=top_scores,
                        adjacency=top_adj,
                        K=K_target,
                        alpha=cfg["alpha"],
                        beta_conn=cfg["beta_conn"],
                        gamma_cov=cfg["gamma_cov"],
                        grid_size=6,
                        shots=cfg["shots"],
                        p_layers=2,
                    )
                    selection_raw = result.get("selection", [])
                    method = result.get("method", "unknown")

                    # Map capped indices back to original indices
                    selection = [int(top_indices[s]) for s in selection_raw]

                    # Optimizer picks first, then every other session in
                    # cosine order. Filling by raw index order instead would
                    # put arbitrary sessions at ranks beyond the selection
                    # and distort R@10 / NDCG@10.
                    sel_set = set(selection)
                    ranked = selection + [i for i in cos_ranked if i not in sel_set]

                    r[cfg_name] = {
                        **_rank_metrics(ranked, gold_indices),
                        "method": method,
                        "n_capped": len(top_indices),
                        "score": float(result.get("score", 0)),
                        "optimal_score": float(result.get("optimal", {}).get("score", 0)),
                        "time": time.time() - cfg_t0,
                    }

            except Exception as e:
                # Best-effort stage: record the failure on this question and
                # keep going, so one optimizer error does not abort the run.
                import traceback
                r["qmg_error"] = f"{type(e).__name__}: {e}"
                r["qmg_traceback"] = traceback.format_exc()

            r["total_qmg_time"] = time.time() - qmg_t0

        results.append(r)

        # Progress every 5 questions
        if (idx + 1) % 5 == 0:
            elapsed = time.time() - T_START
            cos_done = [rr for rr in results if not rr.get("skip") and "cosine" in rr]
            if cos_done:
                cos_r5_avg = np.mean([rr["cosine"]["r5"] for rr in cos_done]) * 100
                print(f"[{idx+1}/{n_questions}] {elapsed:.0f}s cos_r5={cos_r5_avg:.1f}%", flush=True)

    # Summary
    effective = [r for r in results if not r.get("skip")]

    cos_items = [r for r in effective if "cosine" in r]
    print("\n" + "=" * 60, flush=True)
    print(f"LONGMEMEVAL — {datetime.now(timezone.utc).isoformat()}", flush=True)
    print(f"Questions: {len(effective)} effective ({len(results)-len(effective)} skipped)", flush=True)

    if cos_items:
        print("\nCOSINE BASELINE:", flush=True)
        _print_rank_summary(cos_items, "cosine")

    for cfg_name, _cfg in qmg_configs:
        items = [r for r in effective if cfg_name in r]
        if not items:
            continue
        # Group R@5 by the method name the optimizer reported for each run.
        methods = {}
        for r in items:
            m = r[cfg_name].get("method", "?")
            methods.setdefault(m, []).append(r[cfg_name]["r5"])
        avg_time = np.mean([r[cfg_name]["time"] for r in items])
        print(f"\nQMG {cfg_name.upper()}:", flush=True)
        _print_rank_summary(items, cfg_name)
        print(f" Avg time: {avg_time:.1f}s", flush=True)
        for m, vals in sorted(methods.items()):
            print(f" {m}: {len(vals)}x R@5={np.mean(vals)*100:.1f}%", flush=True)

    total_t = time.time() - T_START
    print(f"\nTotal: {total_t:.0f}s ({total_t/60:.1f} min)", flush=True)
    print("=" * 60, flush=True)

    _write_results(results, len(data))


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
LongMemEval 500 — Two-Stage Pipeline Benchmark.
|
|
4
|
+
|
|
5
|
+
Stage 1: gte-large cosine similarity -> candidate ranking
|
|
6
|
+
Stage 2: QAOA+CVaR subgraph refinement on top candidates
|
|
7
|
+
|
|
8
|
+
Measures: pure cosine vs cosine+QAOA refinement vs greedy subgraph
|
|
9
|
+
|
|
10
|
+
DK 🦍
|
|
11
|
+
"""
|
|
12
|
+
import json, time, math, sys, os, argparse, csv
|
|
13
|
+
from datetime import datetime, timezone
|
|
14
|
+
import numpy as np
|
|
15
|
+
|
|
16
|
+
# Input and output locations. The defaults are the absolute paths this script
# shipped with; each can be overridden through the environment so the
# benchmark can run on another machine without editing the file.
DATA_PATH = os.environ.get(
    "LONGMEMEVAL_DATA",
    "/home/dt/projects-shared/LongMemEval/data/longmemeval_s_cleaned.json",
)
RESULTS_DIR = os.environ.get("QMG_RESULTS_DIR", "/home/dt/qmg-v1/benchmarks")
RESULTS_FILE = os.path.join(RESULTS_DIR, "longmemeval_staged_results.json")
CSV_FILE = os.path.join(RESULTS_DIR, "longmemeval_staged_results.csv")

# Wall-clock reference taken at import time; progress reports are measured
# from this instant.
T_START = time.time()
|
|
22
|
+
|
|
23
|
+
def flatten_session(session):
    """Turn one haystack session into a single text string.

    A list is read as a sequence of turns: each dict turn becomes
    ``"<role>: <body>"`` (body = ``content`` if that key exists, else
    ``text``, else ``str(turn)``) and every other turn becomes
    ``str(turn)``; the pieces are joined with newlines. A string is returned
    unchanged, and any other value is returned as ``str(session)``.
    """
    if isinstance(session, list):
        lines = []
        for turn in session:
            if isinstance(turn, dict):
                fallback = turn.get("text", str(turn))
                lines.append(f"{turn.get('role', '')}: {turn.get('content', fallback)}")
            else:
                lines.append(str(turn))
        return "\n".join(lines)
    return session if isinstance(session, str) else str(session)
|
|
34
|
+
|
|
35
|
+
def load_data(path, limit=None):
    """Load benchmark items from a JSON file.

    Args:
        path: Path of the JSON file to read.
        limit: When truthy, only the first ``limit`` items are kept.

    Returns:
        The parsed JSON value if it is a list. Otherwise the value stored
        under the first of the keys ``"data"``, ``"questions"``,
        ``"items"``, ``"results"`` that is present; if none is present the
        parsed value is returned as-is.
    """
    # Read as UTF-8 explicitly: without it open() falls back to the platform
    # locale encoding, which can mis-decode non-ASCII text in the dataset.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, list):
        for key in ("data", "questions", "items", "results"):
            if key in data:
                data = data[key]
                break
    if limit:
        data = data[:limit]
    return data
|
|
42
|
+
|
|
43
|
+
def recall_at_k(ranked, gold, K):
    """Report whether the top ``K`` of ``ranked`` contains any gold item.

    Returns 1.0 when ``gold`` is empty or when at least one gold item is
    among the first ``K`` entries of ``ranked``; otherwise returns 0.0.
    """
    wanted = frozenset(gold)
    if not wanted:
        return 1.0
    return 0.0 if wanted.isdisjoint(ranked[:K]) else 1.0
|
|
47
|
+
|
|
48
|
+
def ndcg_at_k(ranked, gold, K):
    """Compute binary-relevance NDCG over the first ``K`` ranked items.

    A gold item at 0-based position ``i`` is worth ``1 / log2(i + 2)``. The
    achieved total is normalized by the best possible total for
    ``min(len(set(gold)), K)`` hits. Returns 1.0 for an empty ``gold`` and
    0.0 when the best possible total is zero.
    """
    targets = set(gold)
    if not targets:
        return 1.0

    def discount(rank):
        # Positional discount for a hit at the given 0-based rank.
        return 1.0 / math.log2(rank + 2)

    achieved = sum(discount(rank) for rank, item in enumerate(ranked[:K]) if item in targets)
    best = sum(discount(rank) for rank in range(min(len(targets), K)))
    return achieved / best if best > 0 else 0.0
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def main():
|
|
57
|
+
parser = argparse.ArgumentParser()
|
|
58
|
+
parser.add_argument("--limit", type=int, default=None)
|
|
59
|
+
parser.add_argument("--force", action="store_true", help="Run full 500")
|
|
60
|
+
parser.add_argument("--max-candidates", type=int, default=14, help="QAOA candidate pool size")
|
|
61
|
+
parser.add_argument("--top-k", type=int, default=5, help="Target selection K")
|
|
62
|
+
args = parser.parse_args()
|
|
63
|
+
|
|
64
|
+
data = load_data(DATA_PATH)
|
|
65
|
+
print("Loaded %d questions" % len(data), flush=True)
|
|
66
|
+
|
|
67
|
+
limit = args.limit
|
|
68
|
+
if args.force: limit = None
|
|
69
|
+
if limit: data = data[:limit]
|
|
70
|
+
|
|
71
|
+
from sentence_transformers import SentenceTransformer
|
|
72
|
+
import torch
|
|
73
|
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
74
|
+
print("Loading gte-large on %s..." % device, flush=True)
|
|
75
|
+
model = SentenceTransformer("thenlper/gte-large", device=device)
|
|
76
|
+
dim = model.get_sentence_embedding_dimension()
|
|
77
|
+
print("Model loaded, dim=%d" % dim, flush=True)
|
|
78
|
+
|
|
79
|
+
results = []
|
|
80
|
+
n_questions = len(data)
|
|
81
|
+
max_candidates = args.max_candidates
|
|
82
|
+
top_k = args.top_k
|
|
83
|
+
|
|
84
|
+
# Trackers
|
|
85
|
+
count_qaoa_won = 0
|
|
86
|
+
count_greedy_won = 0
|
|
87
|
+
count_tied = 0
|
|
88
|
+
count_qaoa_runs = 0
|
|
89
|
+
|
|
90
|
+
for idx, item in enumerate(data):
|
|
91
|
+
question = item.get("question", item.get("query", ""))
|
|
92
|
+
haystack = item.get("haystack_sessions", item.get("sessions", item.get("corpus", [])))
|
|
93
|
+
haystack_ids = item.get("haystack_session_ids", item.get("session_ids", []))
|
|
94
|
+
answer_ids = item.get("answer_session_ids", item.get("answer_ids", []))
|
|
95
|
+
|
|
96
|
+
gold_indices = []
|
|
97
|
+
for g in answer_ids:
|
|
98
|
+
try: gold_indices.append(haystack_ids.index(g))
|
|
99
|
+
except ValueError: pass
|
|
100
|
+
|
|
101
|
+
if not gold_indices or len(haystack) < 3:
|
|
102
|
+
results.append({"idx": idx, "skip": True, "reason": "no_gold_or_too_few"})
|
|
103
|
+
continue
|
|
104
|
+
|
|
105
|
+
texts = [flatten_session(s) for s in haystack]
|
|
106
|
+
|
|
107
|
+
# Encode
|
|
108
|
+
t0 = time.time()
|
|
109
|
+
all_texts = [question] + texts
|
|
110
|
+
embs = model.encode(all_texts, normalize_embeddings=True, batch_size=32, show_progress_bar=False)
|
|
111
|
+
q_emb = embs[0]
|
|
112
|
+
sess_embs = embs[1:]
|
|
113
|
+
encode_time = time.time() - t0
|
|
114
|
+
|
|
115
|
+
n_sessions = len(sess_embs)
|
|
116
|
+
|
|
117
|
+
# --- Stage 1: Cosine ---
|
|
118
|
+
t0 = time.time()
|
|
119
|
+
cos_scores = q_emb @ sess_embs.T
|
|
120
|
+
cos_ranking = np.argsort(cos_scores)[::-1].tolist()
|
|
121
|
+
cos_time = time.time() - t0
|
|
122
|
+
|
|
123
|
+
r = {
|
|
124
|
+
"idx": idx,
|
|
125
|
+
"question": question[:120],
|
|
126
|
+
"n_sessions": n_sessions,
|
|
127
|
+
"n_gold": len(gold_indices),
|
|
128
|
+
"cosine": {
|
|
129
|
+
"r1": float(recall_at_k(cos_ranking, gold_indices, 1)),
|
|
130
|
+
"r5": float(recall_at_k(cos_ranking, gold_indices, 5)),
|
|
131
|
+
"r10": float(recall_at_k(cos_ranking, gold_indices, 10)),
|
|
132
|
+
"ndcg": float(ndcg_at_k(cos_ranking, gold_indices, 10)),
|
|
133
|
+
"time": cos_time,
|
|
134
|
+
},
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
# --- Stage 2: QAOA+CVaR refinement on top candidates ---
|
|
138
|
+
try:
|
|
139
|
+
t0 = time.time()
|
|
140
|
+
sys.path.insert(0, "/home/dt/qmg-v1")
|
|
141
|
+
from quantum_memory_graph.subgraph_optimizer import optimize_subgraph
|
|
142
|
+
|
|
143
|
+
# Take candidates from cosine top-N
|
|
144
|
+
top_indices = cos_ranking[:max_candidates]
|
|
145
|
+
top_scores = cos_scores[top_indices]
|
|
146
|
+
|
|
147
|
+
# Build adjacency from top-candidate embeddings
|
|
148
|
+
top_embs = sess_embs[top_indices]
|
|
149
|
+
adj = top_embs @ top_embs.T
|
|
150
|
+
np.fill_diagonal(adj, 0.0)
|
|
151
|
+
|
|
152
|
+
# Methods to compare
|
|
153
|
+
for method_name, cfg in [
|
|
154
|
+
("qaoa_cvar", {"alpha": 0.4, "beta_conn": 0.35, "gamma_cov": 0.25, "shots": 4096, "p_layers": 2}),
|
|
155
|
+
("greedy_subgraph", {"alpha": 0.4, "beta_conn": 0.35, "gamma_cov": 0.25}),
|
|
156
|
+
]:
|
|
157
|
+
result = optimize_subgraph(
|
|
158
|
+
relevance_scores=top_scores,
|
|
159
|
+
adjacency=adj,
|
|
160
|
+
K=top_k,
|
|
161
|
+
alpha=cfg["alpha"],
|
|
162
|
+
beta_conn=cfg["beta_conn"],
|
|
163
|
+
gamma_cov=cfg["gamma_cov"],
|
|
164
|
+
grid_size=6,
|
|
165
|
+
shots=cfg.get("shots", 4096),
|
|
166
|
+
p_layers=cfg.get("p_layers", 2),
|
|
167
|
+
)
|
|
168
|
+
selection_raw = result.get("selection", [])
|
|
169
|
+
opt_method = result.get("method", "unknown")
|
|
170
|
+
|
|
171
|
+
# Map capped indices back
|
|
172
|
+
selection = [top_indices[s] for s in selection_raw]
|
|
173
|
+
|
|
174
|
+
# Build ranked list: QAOA picks first, then remaining in cosine order
|
|
175
|
+
sel_set = set(selection)
|
|
176
|
+
ranked = list(selection)
|
|
177
|
+
for i in cos_ranking:
|
|
178
|
+
if len(ranked) >= n_sessions: break
|
|
179
|
+
if i not in sel_set:
|
|
180
|
+
ranked.append(i)
|
|
181
|
+
|
|
182
|
+
r[method_name] = {
|
|
183
|
+
"r1": float(recall_at_k(ranked, gold_indices, 1)),
|
|
184
|
+
"r5": float(recall_at_k(ranked, gold_indices, 5)),
|
|
185
|
+
"r10": float(recall_at_k(ranked, gold_indices, 10)),
|
|
186
|
+
"ndcg": float(ndcg_at_k(ranked, gold_indices, 10)),
|
|
187
|
+
"method": opt_method,
|
|
188
|
+
"n_capped": len(top_indices),
|
|
189
|
+
"score": float(result.get("score", 0)),
|
|
190
|
+
"optimal_score": float(result.get("optimal", {}).get("score", 0)),
|
|
191
|
+
"time": time.time() - t0,
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
# Determine winner between QAOA and cosine
|
|
195
|
+
q_r5 = r.get("qaoa_cvar", {}).get("r5", 0)
|
|
196
|
+
c_r5 = r["cosine"]["r5"]
|
|
197
|
+
if q_r5 > c_r5:
|
|
198
|
+
r["stage2_winner"] = "qaoa_cvar"
|
|
199
|
+
count_qaoa_won += 1
|
|
200
|
+
elif c_r5 > q_r5:
|
|
201
|
+
r["stage2_winner"] = "cosine"
|
|
202
|
+
count_greedy_won += 1
|
|
203
|
+
else:
|
|
204
|
+
r["stage2_winner"] = "tie"
|
|
205
|
+
count_tied += 1
|
|
206
|
+
|
|
207
|
+
if r.get("qaoa_cvar", {}).get("method") == "qaoa":
|
|
208
|
+
count_qaoa_runs += 1
|
|
209
|
+
|
|
210
|
+
except Exception as e:
|
|
211
|
+
import traceback
|
|
212
|
+
r["stage2_error"] = "%s: %s" % (type(e).__name__, e)
|
|
213
|
+
r["stage2_traceback"] = traceback.format_exc()
|
|
214
|
+
|
|
215
|
+
results.append(r)
|
|
216
|
+
|
|
217
|
+
# Progress
|
|
218
|
+
if (idx+1) % 5 == 0:
|
|
219
|
+
elapsed = time.time() - T_START
|
|
220
|
+
effective = [rr for rr in results if not rr.get("skip")]
|
|
221
|
+
if effective:
|
|
222
|
+
c_r5_avg = np.mean([rr["cosine"]["r5"] for rr in effective]) * 100
|
|
223
|
+
q_r5_avg = np.mean([rr.get("qaoa_cvar", {}).get("r5", 0) for rr in effective if "qaoa_cvar" in rr]) * 100
|
|
224
|
+
q_wins = sum(1 for rr in effective if rr.get("stage2_winner") == "qaoa_cvar")
|
|
225
|
+
print("[%d/%d] %.0fs | cos_r5=%.1f%% | qaoa_r5=%.1f%% | qaoa_wins=%d" % (
|
|
226
|
+
idx+1, n_questions, elapsed, c_r5_avg, q_r5_avg, q_wins), flush=True)
|
|
227
|
+
|
|
228
|
+
# Summary
|
|
229
|
+
effective = [r for r in results if not r.get("skip")]
|
|
230
|
+
n_eff = len(effective)
|
|
231
|
+
|
|
232
|
+
print("\n" + "=" * 80, flush=True)
|
|
233
|
+
print("LONGMEMEVAL TWO-STAGE — %s" % datetime.now(timezone.utc).isoformat(), flush=True)
|
|
234
|
+
print("Questions: %d effective (%d skipped)" % (n_eff, n_questions - n_eff), flush=True)
|
|
235
|
+
print("Max candidates: %d, Target K: %d" % (max_candidates, top_k), flush=True)
|
|
236
|
+
print()
|
|
237
|
+
|
|
238
|
+
# Stage 1: Pure cosine
|
|
239
|
+
cos_items = [r for r in effective if "cosine" in r]
|
|
240
|
+
if cos_items:
|
|
241
|
+
cos = {
|
|
242
|
+
"r1": np.mean([r["cosine"]["r1"] for r in cos_items]) * 100,
|
|
243
|
+
"r5": np.mean([r["cosine"]["r5"] for r in cos_items]) * 100,
|
|
244
|
+
"r10": np.mean([r["cosine"]["r10"] for r in cos_items]) * 100,
|
|
245
|
+
"ndcg": np.mean([r["cosine"]["ndcg"] for r in cos_items]),
|
|
246
|
+
}
|
|
247
|
+
print("--- STAGE 1: COSINE BASELINE ---")
|
|
248
|
+
print(" R@1: %.1f%%" % cos["r1"])
|
|
249
|
+
print(" R@5: %.1f%%" % cos["r5"])
|
|
250
|
+
print(" R@10: %.1f%%" % cos["r10"])
|
|
251
|
+
print(" NDCG: %.4f" % cos["ndcg"])
|
|
252
|
+
print()
|
|
253
|
+
|
|
254
|
+
# Stage 2: QAOA+CVaR refinement
|
|
255
|
+
qaoa_items = [r for r in effective if "qaoa_cvar" in r]
|
|
256
|
+
if qaoa_items:
|
|
257
|
+
qaoa = {
|
|
258
|
+
"r1": np.mean([r["qaoa_cvar"]["r1"] for r in qaoa_items]) * 100,
|
|
259
|
+
"r5": np.mean([r["qaoa_cvar"]["r5"] for r in qaoa_items]) * 100,
|
|
260
|
+
"r10": np.mean([r["qaoa_cvar"]["r10"] for r in qaoa_items]) * 100,
|
|
261
|
+
"ndcg": np.mean([r["qaoa_cvar"]["ndcg"] for r in qaoa_items]),
|
|
262
|
+
}
|
|
263
|
+
print("--- STAGE 2: COSINE + QAOA REFINEMENT ---")
|
|
264
|
+
print(" R@1: %.1f%%" % qaoa["r1"])
|
|
265
|
+
print(" R@5: %.1f%%" % qaoa["r5"])
|
|
266
|
+
print(" R@10: %.1f%%" % qaoa["r10"])
|
|
267
|
+
print(" NDCG: %.4f" % qaoa["ndcg"])
|
|
268
|
+
print()
|
|
269
|
+
|
|
270
|
+
# Greedy subgraph baseline
|
|
271
|
+
greedy_items = [r for r in effective if "greedy_subgraph" in r]
|
|
272
|
+
if greedy_items:
|
|
273
|
+
greedy = {
|
|
274
|
+
"r1": np.mean([r["greedy_subgraph"]["r1"] for r in greedy_items]) * 100,
|
|
275
|
+
"r5": np.mean([r["greedy_subgraph"]["r5"] for r in greedy_items]) * 100,
|
|
276
|
+
"r10": np.mean([r["greedy_subgraph"]["r10"] for r in greedy_items]) * 100,
|
|
277
|
+
"ndcg": np.mean([r["greedy_subgraph"]["ndcg"] for r in greedy_items]),
|
|
278
|
+
}
|
|
279
|
+
print("--- BASELINE: COSINE + GREEDY SUBGRAPH ---")
|
|
280
|
+
print(" R@1: %.1f%%" % greedy["r1"])
|
|
281
|
+
print(" R@5: %.1f%%" % greedy["r5"])
|
|
282
|
+
print(" R@10: %.1f%%" % greedy["r10"])
|
|
283
|
+
print(" NDCG: %.4f" % greedy["ndcg"])
|
|
284
|
+
print()
|
|
285
|
+
|
|
286
|
+
# Head-to-head: QAOA vs Cosine
|
|
287
|
+
print("--- HEAD-TO-HEAD (QAOA refinement vs pure cosine) ---")
|
|
288
|
+
print(" Questions where QAOA refinement WINS: %d (%.1f%%)" % (count_qaoa_won, count_qaoa_won/n_eff*100))
|
|
289
|
+
print(" Questions where cosine alone WINS: %d (%.1f%%)" % (count_greedy_won, count_greedy_won/n_eff*100))
|
|
290
|
+
print(" Ties: %d (%.1f%%)" % (count_tied, count_tied/n_eff*100))
|
|
291
|
+
print(" QAOA optimizer ran (%d/%d)" % (count_qaoa_runs, n_eff))
|
|
292
|
+
print()
|
|
293
|
+
|
|
294
|
+
# Delta vs baseline
|
|
295
|
+
if qaoa_items and cos_items:
|
|
296
|
+
delta_r1 = qaoa["r1"] - cos["r1"]
|
|
297
|
+
delta_r5 = qaoa["r5"] - cos["r5"]
|
|
298
|
+
delta_r10 = qaoa["r10"] - cos["r10"]
|
|
299
|
+
print("--- DELTA (stage2 - stage1) ---")
|
|
300
|
+
print(" R@1: %+.1f%%" % delta_r1)
|
|
301
|
+
print(" R@5: %+.1f%%" % delta_r5)
|
|
302
|
+
print(" R@10: %+.1f%%" % delta_r10)
|
|
303
|
+
print()
|
|
304
|
+
|
|
305
|
+
total_t = time.time() - T_START
|
|
306
|
+
print("Total: %.0fs (%.1f min)" % (total_t, total_t/60), flush=True)
|
|
307
|
+
print("=" * 80, flush=True)
|
|
308
|
+
|
|
309
|
+
# Save JSON
|
|
310
|
+
with open(RESULTS_FILE, "w") as f:
|
|
311
|
+
json.dump({
|
|
312
|
+
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
313
|
+
"n_total": len(data),
|
|
314
|
+
"config": {"max_candidates": max_candidates, "top_k": top_k},
|
|
315
|
+
"cosine": cos if cos_items else {},
|
|
316
|
+
"qaoa_cvar": qaoa if qaoa_items else {},
|
|
317
|
+
"greedy_subgraph": greedy if greedy_items else {},
|
|
318
|
+
"count_qaoa_won": count_qaoa_won,
|
|
319
|
+
"count_cosine_won": count_greedy_won,
|
|
320
|
+
"count_tied": count_tied,
|
|
321
|
+
"count_qaoa_runs": count_qaoa_runs,
|
|
322
|
+
"results": results,
|
|
323
|
+
}, f, indent=2, default=str)
|
|
324
|
+
print("Saved JSON to %s" % RESULTS_FILE, flush=True)
|
|
325
|
+
|
|
326
|
+
# Save CSV
|
|
327
|
+
with open(CSV_FILE, "w", newline="") as f:
|
|
328
|
+
w = csv.writer(f)
|
|
329
|
+
w.writerow([
|
|
330
|
+
"idx","n","ngold",
|
|
331
|
+
"cr1","cr5","cr10","cndcg",
|
|
332
|
+
"qr1","qr5","qr10","qndcg","qmethod",
|
|
333
|
+
"gr1","gr5","gr10","gndcg","gmethod",
|
|
334
|
+
"winner"
|
|
335
|
+
])
|
|
336
|
+
for r in results:
|
|
337
|
+
if r.get("skip"): continue
|
|
338
|
+
def g(d, key): return d.get(key, "") if d else ""
|
|
339
|
+
w.writerow([
|
|
340
|
+
r["idx"], r["n_sessions"], r["n_gold"],
|
|
341
|
+
g(r.get("cosine"), "r1"), g(r.get("cosine"), "r5"),
|
|
342
|
+
g(r.get("cosine"), "r10"), g(r.get("cosine"), "ndcg"),
|
|
343
|
+
g(r.get("qaoa_cvar"), "r1"), g(r.get("qaoa_cvar"), "r5"),
|
|
344
|
+
g(r.get("qaoa_cvar"), "r10"), g(r.get("qaoa_cvar"), "ndcg"),
|
|
345
|
+
g(r.get("qaoa_cvar"), "method"),
|
|
346
|
+
g(r.get("greedy_subgraph"), "r1"), g(r.get("greedy_subgraph"), "r5"),
|
|
347
|
+
g(r.get("greedy_subgraph"), "r10"), g(r.get("greedy_subgraph"), "ndcg"),
|
|
348
|
+
g(r.get("greedy_subgraph"), "method"),
|
|
349
|
+
r.get("stage2_winner", "?"),
|
|
350
|
+
])
|
|
351
|
+
print("Saved CSV to %s" % CSV_FILE, flush=True)
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
if __name__ == "__main__":
|
|
355
|
+
main()
|
{quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1/quantum_memory_graph.egg-info}/PKG-INFO
RENAMED
|
@@ -1,15 +1,25 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: quantum-memory-graph
|
|
3
|
-
Version: 1.2.0
|
|
3
|
+
Version: 1.2.1
|
|
4
4
|
Summary: Quantum-optimized knowledge graph memory for AI agents. Relationship-aware subgraph selection via QAOA.
|
|
5
5
|
Home-page: https://github.com/Dustin-a11y/quantum-memory-graph
|
|
6
6
|
Author: Coinkong (Chef's Attraction)
|
|
7
7
|
License: MIT
|
|
8
|
+
Project-URL: Source Code, https://github.com/Dustin-a11y/quantum-memory-graph
|
|
9
|
+
Project-URL: Issue Tracker, https://github.com/Dustin-a11y/quantum-memory-graph/issues
|
|
10
|
+
Project-URL: Benchmark Results, https://github.com/Dustin-a11y/quantum-memory-graph/tree/main/benchmarks
|
|
11
|
+
Project-URL: LongMemEval Submission, https://github.com/xiaowu0162/LongMemEval/issues
|
|
8
12
|
Keywords: quantum,memory,knowledge-graph,agents,qaoa,ai
|
|
9
13
|
Classifier: Development Status :: 4 - Beta
|
|
10
14
|
Classifier: Intended Audience :: Developers
|
|
11
15
|
Classifier: License :: OSI Approved :: MIT License
|
|
12
16
|
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
13
23
|
Requires-Python: >=3.9
|
|
14
24
|
Description-Content-Type: text/markdown
|
|
15
25
|
License-File: LICENSE
|
|
@@ -37,9 +47,9 @@ Every memory system treats memories as independent documents — search, rank, s
|
|
|
37
47
|
|
|
38
48
|
Quantum Memory Graph maps these relationships, then uses QAOA to find the optimal *combination* of memories — not just the most relevant individuals, but the best connected subgraph that gives your agent maximum context.
|
|
39
49
|
|
|
40
|
-
## Benchmark: MemCombine
|
|
50
|
+
## Benchmark: MemCombine (Internal — Memory Combination)
|
|
41
51
|
|
|
42
|
-
|
|
52
|
+
MemCombine tests what no existing benchmark measures — **memory combination quality**, where QAOA graph selection finds coherent subsets that embedding similarity misses.
|
|
43
53
|
|
|
44
54
|
| Method | Coverage | Evidence Recall | F1 | Perfect |
|
|
45
55
|
|--------|----------|----------------|----|---------|
|
|
@@ -48,6 +58,13 @@ We created MemCombine to test what no existing benchmark measures — **memory c
|
|
|
48
58
|
| **Advantage** | **+26.8%** | **+25.4%** | **+24.5%** | |
|
|
49
59
|
|
|
50
60
|
When the task is "find memories that work *together*," graph-aware quantum selection crushes pure similarity search.
|
|
61
|
+
|
|
62
|
+
> **How to read the LongMemEval table below:** The R@5/R@10 numbers are driven by QMG's chunked
|
|
63
|
+
> embedding retrieval pipeline (Stage 1: gte-large, 500-char chunks, mean-of-top-3
|
|
64
|
+
> scoring). QAOA (Stage 2) refines the top-14 candidates for relationship-aware
|
|
65
|
+
> selection — its advantage shows up in MemCombine (combination quality) rather
|
|
66
|
+
> than raw recall rank. The pipeline as a whole achieves the #1 result on LongMemEval.
|
|
67
|
+
|
|
51
68
|
## 🏆 #1 on LongMemEval (ICLR 2025 Benchmark)
|
|
52
69
|
|
|
53
70
|
Tested on the official [LongMemEval benchmark](https://arxiv.org/abs/2410.10813) for long-term memory in AI agents:
|
|
@@ -57,7 +74,7 @@ Tested on the official [LongMemEval benchmark](https://arxiv.org/abs/2410.10813)
|
|
|
57
74
|
| OMEGA (prev SOTA) | — | 89.2% | 94.1% | 87.5% |
|
|
58
75
|
| Mastra OM | — | 91.0% | 95.2% | 89.1% |
|
|
59
76
|
| **QMG v1.1 (published #1)** | — | **95.8%** | **98.85%** | **93.2%** |
|
|
60
|
-
| **QMG v1.2
|
|
77
|
+
| **QMG v1.2 — chunked retrieval pipeline** 🏆 | **90.6%** | **98.6%** | **99.4%** | **94.26%** |
|
|
61
78
|
|
|
62
79
|
**Benchmark run:** 500 questions, chunked gte-large embeddings (500-char blocks, 100-char overlap, mean-of-top-3 session scoring). Verified on DGX Spark GB10 (CUDA, ~53 min).
|
|
63
80
|
|
|
@@ -65,7 +82,6 @@ Tested on the official [LongMemEval benchmark](https://arxiv.org/abs/2410.10813)
|
|
|
65
82
|
|
|
66
83
|
**See:** `benchmarks/run_longmemeval_chunked_staged.py` for the exact benchmark code, `benchmarks/longmemeval_chunked_staged_results.json` for full per-question results.
|
|
67
84
|
|
|
68
|
-
|
|
69
85
|
## Install
|
|
70
86
|
|
|
71
87
|
```bash
|
{quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/quantum_memory_graph.egg-info/SOURCES.txt
RENAMED
|
@@ -18,6 +18,8 @@ benchmarks/run_final.py
|
|
|
18
18
|
benchmarks/run_full_benchmark.py
|
|
19
19
|
benchmarks/run_full_benchmark_v2.py
|
|
20
20
|
benchmarks/run_longmemeval_chunked_staged.py
|
|
21
|
+
benchmarks/run_longmemeval_cvar_v2.py
|
|
22
|
+
benchmarks/run_longmemeval_staged.py
|
|
21
23
|
quantum_memory_graph/__init__.py
|
|
22
24
|
quantum_memory_graph/__main__.py
|
|
23
25
|
quantum_memory_graph/api.py
|
|
@@ -1,18 +1,29 @@
|
|
|
1
1
|
[metadata]
|
|
2
2
|
name = quantum-memory-graph
|
|
3
|
-
version = 1.2.0
|
|
3
|
+
version = 1.2.1
|
|
4
4
|
description = Quantum-optimized knowledge graph memory for AI agents. Relationship-aware subgraph selection via QAOA.
|
|
5
5
|
long_description = file: README.md
|
|
6
6
|
long_description_content_type = text/markdown
|
|
7
7
|
author = Coinkong (Chef's Attraction)
|
|
8
8
|
license = MIT
|
|
9
9
|
url = https://github.com/Dustin-a11y/quantum-memory-graph
|
|
10
|
+
project_urls =
|
|
11
|
+
Source Code = https://github.com/Dustin-a11y/quantum-memory-graph
|
|
12
|
+
Issue Tracker = https://github.com/Dustin-a11y/quantum-memory-graph/issues
|
|
13
|
+
Benchmark Results = https://github.com/Dustin-a11y/quantum-memory-graph/tree/main/benchmarks
|
|
14
|
+
LongMemEval Submission = https://github.com/xiaowu0162/LongMemEval/issues
|
|
10
15
|
keywords = quantum, memory, knowledge-graph, agents, qaoa, ai
|
|
11
16
|
classifiers =
|
|
12
17
|
Development Status :: 4 - Beta
|
|
13
18
|
Intended Audience :: Developers
|
|
14
19
|
License :: OSI Approved :: MIT License
|
|
15
20
|
Programming Language :: Python :: 3
|
|
21
|
+
Programming Language :: Python :: 3.9
|
|
22
|
+
Programming Language :: Python :: 3.10
|
|
23
|
+
Programming Language :: Python :: 3.11
|
|
24
|
+
Programming Language :: Python :: 3.12
|
|
25
|
+
Programming Language :: Python :: 3.13
|
|
26
|
+
Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
27
|
|
|
17
28
|
[options]
|
|
18
29
|
packages = find:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/benchmarks/longmemeval_bench_v2.py
RENAMED
|
File without changes
|
{quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/benchmarks/longmemeval_bench_v3.py
RENAMED
|
File without changes
|
{quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/benchmarks/longmemeval_bench_v4.py
RENAMED
|
File without changes
|
{quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/benchmarks/longmemeval_bench_v5.py
RENAMED
|
File without changes
|
{quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/benchmarks/longmemeval_bench_v6.py
RENAMED
|
File without changes
|
{quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/benchmarks/longmemeval_bench_v7.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/benchmarks/run_full_benchmark_v2.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/quantum_memory_graph/pce_optimizer.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/quantum_memory_graph/subgraph_optimizer.py
RENAMED
|
File without changes
|
|
File without changes
|
{quantum_memory_graph-1.2.0 → quantum_memory_graph-1.2.1}/quantum_memory_graph.egg-info/requires.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|