limbo-ai 1.24.9 → 1.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/cli.js +23 -16
- package/docker-compose.test.yml +22 -0
- package/evals/cases/create-reminder.json +22 -0
- package/evals/cases/hard-ambiguous-request.json +12 -0
- package/evals/cases/hard-complex-note.json +17 -0
- package/evals/cases/hard-synthesize-knowledge.json +33 -0
- package/evals/cases/medium-note-type-inference.json +16 -0
- package/evals/cases/medium-person-multiple-facts.json +16 -0
- package/evals/cases/medium-search-implicit.json +13 -0
- package/evals/cases/multi-step-remember-and-search.json +24 -0
- package/evals/cases/read-note-by-id.json +22 -0
- package/evals/cases/remember-fact.json +15 -0
- package/evals/cases/reminder-timezone.json +23 -0
- package/evals/cases/search-existing-note.json +27 -0
- package/evals/cases/update-map.json +28 -0
- package/evals/cases/web-search.json +22 -0
- package/evals/cli.js +477 -0
- package/evals/docker-compose.eval.yml +43 -0
- package/evals/judge/rubrics.json +10 -0
- package/evals/lib/judge.js +69 -0
- package/evals/lib/mcp-log.js +62 -0
- package/evals/lib/scorer.js +153 -0
- package/evals/lib/vault-diff.js +59 -0
- package/evals/results/.gitkeep +0 -0
- package/evals/results/baseline.json +662 -0
- package/evals/results/history/.gitkeep +0 -0
- package/evals/results/history/run-1774559258082.json +662 -0
- package/evals/results/history/run-1774559485256.json +662 -0
- package/evals/results/history/run-1774559674855.json +662 -0
- package/evals/results/latest.json +662 -0
- package/evals/test/scorer.test.js +180 -0
- package/evals/vault-seed/maps/.gitkeep +0 -0
- package/evals/vault-seed/notes/.gitkeep +0 -0
- package/evals/vault-seed/notes/eval-seed-birthday.md +10 -0
- package/mcp-server/index.js +30 -10
- package/mcp-server/test/eval-logging.test.js +254 -0
- package/package.json +3 -2
- package/setup-server/server.js +14 -10
- package/test/cli-auth.test.js +21 -15
- package/test/setup-server.test.js +14 -7
- package/test/zeroclaw-migration.test.js +3 -3
package/README.md
CHANGED
|
@@ -18,7 +18,7 @@ Limbo is a second brain with a conversational interface. It stores atomic notes
|
|
|
18
18
|
|
|
19
19
|
### 1. Provision a server
|
|
20
20
|
|
|
21
|
-
Any Ubuntu/Debian VPS with 1 GB+ RAM.
|
|
21
|
+
Any Ubuntu/Debian VPS with 1 GB+ RAM.
|
|
22
22
|
|
|
23
23
|
### 2. Run the installer
|
|
24
24
|
|
package/cli.js
CHANGED
|
@@ -1297,38 +1297,45 @@ function writeAuthProfilesToDocker(store) {
|
|
|
1297
1297
|
}
|
|
1298
1298
|
|
|
1299
1299
|
function buildCodexAuthProfile(profile) {
|
|
1300
|
-
const
|
|
1300
|
+
const profileName = profile.email || 'default';
|
|
1301
|
+
const profileId = `openai-codex:${profileName}`;
|
|
1302
|
+
const now = new Date().toISOString();
|
|
1301
1303
|
return {
|
|
1302
|
-
|
|
1304
|
+
schema_version: 1,
|
|
1305
|
+
updated_at: now,
|
|
1306
|
+
active_profiles: { 'openai-codex': profileId },
|
|
1303
1307
|
profiles: {
|
|
1304
1308
|
[profileId]: {
|
|
1305
|
-
type: 'oauth',
|
|
1306
1309
|
provider: 'openai-codex',
|
|
1307
|
-
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1310
|
+
profile_name: profileName,
|
|
1311
|
+
kind: 'oauth',
|
|
1312
|
+
account_id: profile.accountId || null,
|
|
1313
|
+
access_token: profile.access,
|
|
1314
|
+
refresh_token: profile.refresh,
|
|
1315
|
+
expires_at: new Date(profile.expires).toISOString(),
|
|
1316
|
+
created_at: now,
|
|
1317
|
+
updated_at: now,
|
|
1311
1318
|
},
|
|
1312
1319
|
},
|
|
1313
|
-
order: {},
|
|
1314
|
-
lastGood: {},
|
|
1315
|
-
usageStats: {},
|
|
1316
1320
|
};
|
|
1317
1321
|
}
|
|
1318
1322
|
|
|
1319
1323
|
function buildAnthropicAuthProfile(token) {
|
|
1324
|
+
const now = new Date().toISOString();
|
|
1320
1325
|
return {
|
|
1321
|
-
|
|
1326
|
+
schema_version: 1,
|
|
1327
|
+
updated_at: now,
|
|
1328
|
+
active_profiles: { anthropic: 'anthropic:default' },
|
|
1322
1329
|
profiles: {
|
|
1323
|
-
'anthropic:
|
|
1324
|
-
type: 'token',
|
|
1330
|
+
'anthropic:default': {
|
|
1325
1331
|
provider: 'anthropic',
|
|
1332
|
+
profile_name: 'default',
|
|
1333
|
+
kind: 'token',
|
|
1326
1334
|
token,
|
|
1335
|
+
created_at: now,
|
|
1336
|
+
updated_at: now,
|
|
1327
1337
|
},
|
|
1328
1338
|
},
|
|
1329
|
-
order: { anthropic: ['anthropic:token'] },
|
|
1330
|
-
lastGood: {},
|
|
1331
|
-
usageStats: {},
|
|
1332
1339
|
};
|
|
1333
1340
|
}
|
|
1334
1341
|
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Local testing — setup persists across restarts.
|
|
2
|
+
# Start: docker compose -f docker-compose.test.yml up -d
|
|
3
|
+
# Logs: docker compose -f docker-compose.test.yml logs -f
|
|
4
|
+
# Stop: docker compose -f docker-compose.test.yml down
|
|
5
|
+
# Reset: docker compose -f docker-compose.test.yml down -v (wipes setup)
|
|
6
|
+
services:
|
|
7
|
+
limbo:
|
|
8
|
+
image: limbo:rag-pdf-test
|
|
9
|
+
restart: "no"
|
|
10
|
+
ports:
|
|
11
|
+
- "127.0.0.1:18789:18789"
|
|
12
|
+
volumes:
|
|
13
|
+
- limbo-test-data:/data
|
|
14
|
+
- limbo-test-state:/home/limbo/.zeroclaw
|
|
15
|
+
tmpfs:
|
|
16
|
+
- /tmp:size=100M
|
|
17
|
+
|
|
18
|
+
volumes:
|
|
19
|
+
limbo-test-data:
|
|
20
|
+
name: limbo-test-data
|
|
21
|
+
limbo-test-state:
|
|
22
|
+
name: limbo-test-state
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "create-reminder",
|
|
3
|
+
"description": "User asks Limbo to set a reminder — should create a cron job, not a vault note",
|
|
4
|
+
"input": "Recordame mañana a las 9am que tengo que llamar al banco",
|
|
5
|
+
"assertions": [
|
|
6
|
+
{
|
|
7
|
+
"type": "cron_created",
|
|
8
|
+
"pattern": "banco|bank"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"type": "response_matches",
|
|
12
|
+
"pattern": "(?i)(reminder|recordatorio|avisarte|cron|programado|mañana)"
|
|
13
|
+
}
|
|
14
|
+
],
|
|
15
|
+
"runs": 1,
|
|
16
|
+
"pass_threshold": 1.0,
|
|
17
|
+
"tags": [
|
|
18
|
+
"cron",
|
|
19
|
+
"reminder"
|
|
20
|
+
],
|
|
21
|
+
"difficulty": "easy"
|
|
22
|
+
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "hard-ambiguous-request",
|
|
3
|
+
"description": "User gives a vague save command with no clear content — model should ask for clarification",
|
|
4
|
+
"input": "Guardate esto para después",
|
|
5
|
+
"assertions": [
|
|
6
|
+
{ "type": "response_matches", "pattern": "(?i)(qu[eé]|what|cu[aá]l|especific|clarif|decime|contame|refer|exactamente|guardar)" }
|
|
7
|
+
],
|
|
8
|
+
"runs": 1,
|
|
9
|
+
"pass_threshold": 1.0,
|
|
10
|
+
"tags": ["ambiguity", "clarification"],
|
|
11
|
+
"difficulty": "hard"
|
|
12
|
+
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "hard-complex-note",
|
|
3
|
+
"description": "User describes a conversation with multiple perspectives and an action item — note should capture all of it",
|
|
4
|
+
"input": "Ayer hablé con Laura del tema de migrar a Kubernetes. Ella dice que no vale la pena para nuestro scale, yo creo que sí. Quedamos en revisar los números la semana que viene.",
|
|
5
|
+
"assertions": [
|
|
6
|
+
{ "type": "tool_called", "tool": "vault_write_note" },
|
|
7
|
+
{ "type": "param_match", "tool": "vault_write_note", "key": "type", "pattern": "decision|insight|meeting|project" },
|
|
8
|
+
{ "type": "vault_note_created", "pattern": "(?i)laura" },
|
|
9
|
+
{ "type": "vault_note_created", "pattern": "(?i)kubernetes|k8s" },
|
|
10
|
+
{ "type": "vault_note_created", "pattern": "(?i)(no vale la pena|not worth|scale)" },
|
|
11
|
+
{ "type": "vault_note_created", "pattern": "(?i)(revisar|review|números|numbers|semana)" }
|
|
12
|
+
],
|
|
13
|
+
"runs": 1,
|
|
14
|
+
"pass_threshold": 1.0,
|
|
15
|
+
"tags": ["tool-calling", "vault_write_note", "complex-content"],
|
|
16
|
+
"difficulty": "hard"
|
|
17
|
+
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "hard-synthesize-knowledge",
|
|
3
|
+
"description": "Multi-step: save two person notes, then ask a broad question that requires searching and synthesizing both",
|
|
4
|
+
"steps": [
|
|
5
|
+
{
|
|
6
|
+
"input": "Acordate que Martín es diseñador UX y trabaja en Mercado Libre",
|
|
7
|
+
"assertions": [
|
|
8
|
+
{ "type": "tool_called", "tool": "vault_write_note" },
|
|
9
|
+
{ "type": "vault_note_created", "pattern": "(?i)mart[ií]n" }
|
|
10
|
+
]
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"input": "Guardá que Sofía es data scientist en Globant y la conozco del secundario",
|
|
14
|
+
"assertions": [
|
|
15
|
+
{ "type": "tool_called", "tool": "vault_write_note" },
|
|
16
|
+
{ "type": "vault_note_created", "pattern": "(?i)sof[ií]a" }
|
|
17
|
+
]
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"input": "Qué sabes de las personas que conozco?",
|
|
21
|
+
"assertions": [
|
|
22
|
+
{ "type": "tool_called", "tool": "vault_search" },
|
|
23
|
+
{ "type": "response_matches", "pattern": "(?i)mart[ií]n" },
|
|
24
|
+
{ "type": "response_matches", "pattern": "(?i)sof[ií]a" },
|
|
25
|
+
{ "type": "response_matches", "pattern": "(?i)(mercado libre|globant)" }
|
|
26
|
+
]
|
|
27
|
+
}
|
|
28
|
+
],
|
|
29
|
+
"runs": 1,
|
|
30
|
+
"pass_threshold": 1.0,
|
|
31
|
+
"tags": ["multi-step", "vault_write_note", "vault_search", "synthesis"],
|
|
32
|
+
"difficulty": "hard"
|
|
33
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "medium-note-type-inference",
|
|
3
|
+
"description": "User describes a team decision — the note type should be 'decision', not 'fact'",
|
|
4
|
+
"input": "Hoy decidimos con el equipo que vamos a usar PostgreSQL en vez de MongoDB para el proyecto nuevo",
|
|
5
|
+
"assertions": [
|
|
6
|
+
{ "type": "tool_called", "tool": "vault_write_note" },
|
|
7
|
+
{ "type": "param_match", "tool": "vault_write_note", "key": "type", "pattern": "decision" },
|
|
8
|
+
{ "type": "vault_note_created", "pattern": "(?i)postgresql|postgres" },
|
|
9
|
+
{ "type": "vault_note_created", "pattern": "(?i)mongodb|mongo" },
|
|
10
|
+
{ "type": "response_matches", "pattern": "(?i)(guardé|guardado|anotado|decisión|decision)" }
|
|
11
|
+
],
|
|
12
|
+
"runs": 1,
|
|
13
|
+
"pass_threshold": 1.0,
|
|
14
|
+
"tags": ["tool-calling", "vault_write_note", "type-inference"],
|
|
15
|
+
"difficulty": "medium"
|
|
16
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "medium-person-multiple-facts",
|
|
3
|
+
"description": "User mentions a person with multiple facts in one message — should create a person note capturing all details",
|
|
4
|
+
"input": "Mi viejo se llama Carlos, es ingeniero y vive en Córdoba",
|
|
5
|
+
"assertions": [
|
|
6
|
+
{ "type": "tool_called", "tool": "vault_write_note" },
|
|
7
|
+
{ "type": "param_match", "tool": "vault_write_note", "key": "type", "pattern": "person" },
|
|
8
|
+
{ "type": "vault_note_created", "pattern": "(?i)carlos" },
|
|
9
|
+
{ "type": "vault_note_created", "pattern": "(?i)ingeniero|engineer" },
|
|
10
|
+
{ "type": "vault_note_created", "pattern": "(?i)c[oó]rdoba" }
|
|
11
|
+
],
|
|
12
|
+
"runs": 1,
|
|
13
|
+
"pass_threshold": 1.0,
|
|
14
|
+
"tags": ["tool-calling", "vault_write_note", "type-inference"],
|
|
15
|
+
"difficulty": "medium"
|
|
16
|
+
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "medium-search-implicit",
|
|
3
|
+
"description": "User asks a broad question about people in tech — should search the vault and return relevant results",
|
|
4
|
+
"input": "Qué sabes sobre la gente que trabaja en tech?",
|
|
5
|
+
"assertions": [
|
|
6
|
+
{ "type": "tool_called", "tool": "vault_search" },
|
|
7
|
+
{ "type": "response_matches", "pattern": "(?i)(no encontr|no tengo|no hay|nothing|google|engineer|ML|machine learning|birthday|cumpleaños)" }
|
|
8
|
+
],
|
|
9
|
+
"runs": 1,
|
|
10
|
+
"pass_threshold": 1.0,
|
|
11
|
+
"tags": ["tool-calling", "vault_search", "retrieval"],
|
|
12
|
+
"difficulty": "medium"
|
|
13
|
+
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "multi-step-remember-and-search",
|
|
3
|
+
"description": "Two-message flow: save a note, then search for it",
|
|
4
|
+
"steps": [
|
|
5
|
+
{
|
|
6
|
+
"input": "Recordame que Alice trabaja en Google como ML engineer",
|
|
7
|
+
"assertions": [
|
|
8
|
+
{ "type": "tool_called", "tool": "vault_write_note" },
|
|
9
|
+
{ "type": "param_match", "tool": "vault_write_note", "key": "type", "pattern": "person" }
|
|
10
|
+
]
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"input": "Que sabes sobre Alice?",
|
|
14
|
+
"assertions": [
|
|
15
|
+
{ "type": "tool_called", "tool": "vault_search" },
|
|
16
|
+
{ "type": "response_matches", "pattern": "(?i)(google|ML|machine learning)" }
|
|
17
|
+
]
|
|
18
|
+
}
|
|
19
|
+
],
|
|
20
|
+
"runs": 1,
|
|
21
|
+
"pass_threshold": 1.0,
|
|
22
|
+
"tags": ["multi-step", "vault_write_note", "vault_search"],
|
|
23
|
+
"difficulty": "easy"
|
|
24
|
+
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "read-note-by-id",
|
|
3
|
+
"description": "User asks to read a specific note (tests vault_read tool selection)",
|
|
4
|
+
"input": "Lee la nota eval-seed-birthday",
|
|
5
|
+
"assertions": [
|
|
6
|
+
{
|
|
7
|
+
"type": "tool_called",
|
|
8
|
+
"tool": "vault_read"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"type": "response_matches",
|
|
12
|
+
"pattern": "(?i)(march 15|tomas|birthday)"
|
|
13
|
+
}
|
|
14
|
+
],
|
|
15
|
+
"runs": 1,
|
|
16
|
+
"pass_threshold": 1.0,
|
|
17
|
+
"tags": [
|
|
18
|
+
"tool-calling",
|
|
19
|
+
"vault_read"
|
|
20
|
+
],
|
|
21
|
+
"difficulty": "easy"
|
|
22
|
+
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "remember-fact",
|
|
3
|
+
"description": "User asks Limbo to remember a personal fact (tests vault_write_note)",
|
|
4
|
+
"input": "Acordate que soy alérgico al maní",
|
|
5
|
+
"assertions": [
|
|
6
|
+
{ "type": "tool_called", "tool": "vault_write_note" },
|
|
7
|
+
{ "type": "param_match", "tool": "vault_write_note", "key": "type", "pattern": "fact|preference" },
|
|
8
|
+
{ "type": "vault_note_created", "pattern": "alerg|mani|peanut" },
|
|
9
|
+
{ "type": "response_matches", "pattern": "(?i)(guardé|guardado|saved|anotado|alérgico|maní)" }
|
|
10
|
+
],
|
|
11
|
+
"runs": 1,
|
|
12
|
+
"pass_threshold": 1.0,
|
|
13
|
+
"tags": ["tool-calling", "vault_write_note"],
|
|
14
|
+
"difficulty": "easy"
|
|
15
|
+
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "reminder-timezone",
|
|
3
|
+
"description": "Reminder should be created in the user's timezone, not UTC",
|
|
4
|
+
"input": "Poneme un reminder para hoy a las 23:00 que tengo que tomar la pastilla",
|
|
5
|
+
"assertions": [
|
|
6
|
+
{
|
|
7
|
+
"type": "cron_created",
|
|
8
|
+
"pattern": "pastilla|pill|medicamento"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"type": "response_matches",
|
|
12
|
+
"pattern": "(?i)(23:00|23hs|11.*pm|reminder|recordatorio|programado)"
|
|
13
|
+
}
|
|
14
|
+
],
|
|
15
|
+
"runs": 1,
|
|
16
|
+
"pass_threshold": 1.0,
|
|
17
|
+
"tags": [
|
|
18
|
+
"cron",
|
|
19
|
+
"reminder",
|
|
20
|
+
"timezone"
|
|
21
|
+
],
|
|
22
|
+
"difficulty": "easy"
|
|
23
|
+
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "search-existing-note",
|
|
3
|
+
"description": "User searches for a pre-seeded note about birthday",
|
|
4
|
+
"input": "Que sabes sobre el cumpleaños de Tomas?",
|
|
5
|
+
"assertions": [
|
|
6
|
+
{
|
|
7
|
+
"type": "tool_called",
|
|
8
|
+
"tool": "vault_search"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"type": "response_matches",
|
|
12
|
+
"pattern": "(?i)(march 15|15 de marzo|marzo)"
|
|
13
|
+
},
|
|
14
|
+
{
|
|
15
|
+
"type": "response_matches",
|
|
16
|
+
"pattern": "(?i)(born|birthday|cumpleaños|naci)"
|
|
17
|
+
}
|
|
18
|
+
],
|
|
19
|
+
"runs": 1,
|
|
20
|
+
"pass_threshold": 1.0,
|
|
21
|
+
"tags": [
|
|
22
|
+
"tool-calling",
|
|
23
|
+
"vault_search",
|
|
24
|
+
"retrieval"
|
|
25
|
+
],
|
|
26
|
+
"difficulty": "easy"
|
|
27
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "update-map",
|
|
3
|
+
"description": "User asks Limbo to add a note to a map of content",
|
|
4
|
+
"input": "Agrega la nota eval-seed-birthday al mapa personal-map en la seccion Datos Personales",
|
|
5
|
+
"assertions": [
|
|
6
|
+
{
|
|
7
|
+
"type": "tool_called",
|
|
8
|
+
"tool": "vault_update_map"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"type": "param_match",
|
|
12
|
+
"tool": "vault_update_map",
|
|
13
|
+
"key": "map",
|
|
14
|
+
"pattern": "personal-map"
|
|
15
|
+
},
|
|
16
|
+
{
|
|
17
|
+
"type": "vault_file_exists",
|
|
18
|
+
"path": "maps/personal-map.md"
|
|
19
|
+
}
|
|
20
|
+
],
|
|
21
|
+
"runs": 1,
|
|
22
|
+
"pass_threshold": 1.0,
|
|
23
|
+
"tags": [
|
|
24
|
+
"tool-calling",
|
|
25
|
+
"vault_update_map"
|
|
26
|
+
],
|
|
27
|
+
"difficulty": "easy"
|
|
28
|
+
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "web-search",
|
|
3
|
+
"description": "User asks a question requiring web search for current information",
|
|
4
|
+
"input": "Buscá en internet cuál es la última versión de Node.js",
|
|
5
|
+
"assertions": [
|
|
6
|
+
{
|
|
7
|
+
"type": "response_matches",
|
|
8
|
+
"pattern": "(?i)(node|nodejs|version|versión|v\\d+)"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"type": "response_matches",
|
|
12
|
+
"pattern": "\\d+\\.\\d+"
|
|
13
|
+
}
|
|
14
|
+
],
|
|
15
|
+
"runs": 1,
|
|
16
|
+
"pass_threshold": 1.0,
|
|
17
|
+
"tags": [
|
|
18
|
+
"web-search",
|
|
19
|
+
"brave"
|
|
20
|
+
],
|
|
21
|
+
"difficulty": "easy"
|
|
22
|
+
}
|