@andespindola/brainlink 0.1.0-beta.16 → 0.1.0-beta.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -22,6 +22,8 @@
22
22
  - Added short-lived hybrid search cache with automatic invalidation on index changes.
23
23
  - Added `stats --extended` observability output with storage, quality and latency probes.
24
24
  - Added `docs/QUICKSTART.md` and aligned README/agent docs with the latest CLI/MCP flows.
25
+ - Added middle-out context assembly so chunk selection expands around the strongest note chunk.
26
+ - Added compressed-space pack prefiltering (token bloom index) before `.blpk` decryption and scan.
25
27
 
26
28
  ## 0.1.0-beta.3
27
29
 
package/README.md CHANGED
@@ -67,8 +67,9 @@ Legacy `.jsonl.gz` packs are upgraded to `.blpk` automatically on first search/c
67
67
  - Obsidian-compatible `[[wiki links]]` and `#tags`.
68
68
  - Weighted graph edges so agents can rank relationship importance and priority.
69
69
  - Backlinks, broken-link reports, orphan detection and validation.
70
- - Full-text, semantic and hybrid retrieval modes.
71
70
  - Full-text, semantic and hybrid retrieval on a local file index.
71
+ - Middle-out context assembly around the strongest chunk per document.
72
+ - Compressed-space prefiltering for `.blpk` packs before decryption and scan.
72
73
  - Agent namespaces under `agents/<agent-id>/`.
73
74
  - S3-compatible bucket vaults through `s3://bucket/prefix` URIs.
74
75
  - CLI with machine-readable `--json` output.
@@ -728,6 +729,7 @@ Modes:
728
729
  - `semantic`: local deterministic embedding similarity only.
729
730
 
730
731
  Hybrid results are cached in-memory for a short TTL and invalidated automatically when the local index file changes.
732
+ Context selection uses a middle-out strategy: it starts from the strongest chunk in a note and expands to neighboring chunks while respecting token budget.
731
733
 
732
734
  ### `context`
733
735
 
@@ -1,13 +1,50 @@
1
+ import { middleOutIndices } from './middle-out.js';
2
/** Maximum number of chunks a single document may contribute to the context. */
const maxSectionsPerDocument = 3;

/** Comparator: highest score first; equal scores fall back to title order. */
const byScore = (left, right) => {
    const scoreDelta = right.score - left.score;
    return scoreDelta || left.title.localeCompare(right.title);
};

/** Comparator: ascending chunk ordinal; results without an ordinal sort last. */
const byOrdinal = (left, right) => {
    const leftOrdinal = left.chunkOrdinal ?? Number.MAX_SAFE_INTEGER;
    const rightOrdinal = right.chunkOrdinal ?? Number.MAX_SAFE_INTEGER;
    return leftOrdinal - rightOrdinal;
};

/**
 * Reorders the results of one document "middle-out": the best-scoring chunk
 * comes first, followed by its neighbours (by chunk ordinal) at growing distance.
 *
 * @param {Array<object>} results - Search results that share a document.
 * @returns {Array<object>} The input itself when it has at most one element,
 *   otherwise a new array in middle-out order (or in score order when the
 *   pivot chunk cannot be located).
 */
const middleOutDocumentResults = (results) => {
    if (results.length <= 1) {
        return results;
    }
    const inDocumentOrder = Array.from(results).sort(byOrdinal);
    const rankedByScore = Array.from(results).sort(byScore);
    const [strongest] = rankedByScore;
    const pivotChunkId = strongest?.chunkId;
    const pivotIndex = inDocumentOrder.findIndex(({ chunkId }) => chunkId === pivotChunkId);
    if (pivotIndex < 0) {
        // Pivot not found in the ordinal view: plain score ranking is the fallback.
        return rankedByScore;
    }
    const visitOrder = middleOutIndices(inDocumentOrder.length, pivotIndex);
    return visitOrder.map((position) => inDocumentOrder[position]);
};
1
17
  export const selectContextSections = (results, maxTokens) => {
2
- const selected = results.reduce((state, result) => {
3
- const tokenCost = Math.ceil(result.content.length / 4);
4
- if (state.usedTokens + tokenCost > maxTokens || state.seenDocuments.has(result.documentId)) {
5
- return state;
18
+ const grouped = results.reduce((state, result) => {
19
+ const current = state.get(result.documentId) ?? [];
20
+ state.set(result.documentId, [...current, result]);
21
+ return state;
22
+ }, new Map());
23
+ const documentOrder = Array.from(results.reduce((state, result) => {
24
+ if (!state.has(result.documentId)) {
25
+ state.set(result.documentId, result.score);
6
26
  }
7
- return {
8
- usedTokens: state.usedTokens + tokenCost,
9
- sections: [
10
- ...state.sections,
27
+ return state;
28
+ }, new Map()).entries())
29
+ .sort((left, right) => right[1] - left[1] || left[0].localeCompare(right[0]))
30
+ .map(([documentId]) => documentId);
31
+ const selected = documentOrder.reduce((state, documentId) => {
32
+ const ordered = middleOutDocumentResults(grouped.get(documentId) ?? []);
33
+ let usedTokens = state.usedTokens;
34
+ let sections = state.sections;
35
+ let seenChunks = state.seenChunks;
36
+ for (let index = 0; index < ordered.length && index < maxSectionsPerDocument; index += 1) {
37
+ const result = ordered[index];
38
+ if (seenChunks.has(result.chunkId)) {
39
+ continue;
40
+ }
41
+ const tokenCost = Math.ceil(result.content.length / 4);
42
+ if (usedTokens + tokenCost > maxTokens) {
43
+ break;
44
+ }
45
+ usedTokens += tokenCost;
46
+ sections = [
47
+ ...sections,
11
48
  {
12
49
  title: result.title,
13
50
  path: result.path,
@@ -16,13 +53,18 @@ export const selectContextSections = (results, maxTokens) => {
16
53
  searchMode: result.searchMode,
17
54
  tags: result.tags
18
55
  }
19
- ],
20
- seenDocuments: new Set([...state.seenDocuments, result.documentId])
56
+ ];
57
+ seenChunks = new Set([...seenChunks, result.chunkId]);
58
+ }
59
+ return {
60
+ usedTokens,
61
+ sections,
62
+ seenChunks
21
63
  };
22
64
  }, {
23
65
  usedTokens: 0,
24
66
  sections: [],
25
- seenDocuments: new Set()
67
+ seenChunks: new Set()
26
68
  });
27
69
  return selected.sections;
28
70
  };
@@ -0,0 +1,18 @@
1
/**
 * Lists every integer index in `[0, size)` ordered by distance from a pivot:
 * the pivot first, then pivot-1, pivot+1, pivot-2, pivot+2, ... skipping
 * positions that fall outside the range.
 *
 * @param {number} size - Number of positions; non-finite or non-positive
 *   values yield an empty list. Fractional sizes are rounded up.
 * @param {number} pivotIndex - Starting position. It is floored and clamped
 *   into the valid range; a value that is not a number falls back to 0.
 * @returns {number[]} All indices exactly once, closest to the pivot first.
 */
export const middleOutIndices = (size, pivotIndex) => {
    if (!Number.isFinite(size) || size <= 0) {
        return [];
    }
    // Work on an integer count so the pivot and loop bound are always integers.
    const count = Math.ceil(size);
    const flooredPivot = Math.floor(pivotIndex);
    // Math.min/Math.max propagate NaN; an unguarded NaN pivot would never
    // satisfy either push condition below and the loop would not terminate.
    const pivot = Number.isNaN(flooredPivot)
        ? 0
        : Math.max(0, Math.min(flooredPivot, count - 1));
    const indices = [pivot];
    for (let offset = 1; indices.length < count; offset += 1) {
        const left = pivot - offset;
        const right = pivot + offset;
        if (left >= 0) {
            indices.push(left);
        }
        if (right < count) {
            indices.push(right);
        }
    }
    return indices;
};
@@ -81,6 +81,7 @@ const toResult = (row, mode, text, semantic) => {
81
81
  title: row.title,
82
82
  path: row.path,
83
83
  chunkId: row.chunkId,
84
+ chunkOrdinal: row.chunkOrdinal,
84
85
  content: row.content,
85
86
  score,
86
87
  textScore: text,
@@ -138,6 +139,7 @@ export const openFileIndex = (vaultPath) => {
138
139
  title: document.title,
139
140
  path: document.path,
140
141
  chunkId: chunk.id,
142
+ chunkOrdinal: chunk.ordinal,
141
143
  content: chunk.content,
142
144
  tags: document.tags,
143
145
  embedding: chunk.embedding
@@ -263,6 +265,7 @@ export const openFileIndex = (vaultPath) => {
263
265
  title: document.title,
264
266
  path: document.path,
265
267
  chunkId: document.id,
268
+ chunkOrdinal: 0,
266
269
  content: document.content,
267
270
  tags: document.tags,
268
271
  embedding: []
@@ -1,11 +1,15 @@
1
1
  import { gunzipSync } from 'node:zlib';
2
2
  import { mkdir, readdir, readFile, rm, writeFile } from 'node:fs/promises';
3
3
  import { join } from 'node:path';
4
+ import { middleOutIndices } from '../domain/middle-out.js';
4
5
  import { decodePrivatePack, encodePrivatePack, isPrivatePackPayload } from './private-pack-codec.js';
5
6
  const packsDirectoryName = 'search-packs';
6
7
  const manifestFileName = 'manifest.json';
7
8
  const rowChunkSize = 5_000;
8
9
  const queryTokenPattern = /[\p{L}\p{N}_-]+/gu;
10
+ const bloomBytes = 256;
11
+ const bloomBitSize = bloomBytes * 8;
12
+ const bloomSeeds = [0x9e3779b1, 0x85ebca6b, 0xc2b2ae35];
9
13
  const toPackDirectory = (vaultPath) => join(vaultPath, '.brainlink', packsDirectoryName);
10
14
  const toManifestPath = (vaultPath) => join(toPackDirectory(vaultPath), manifestFileName);
11
15
  const parseRowsFromPack = async (vaultPath, content) => {
@@ -15,7 +19,29 @@ const parseRowsFromPack = async (vaultPath, content) => {
15
19
  .split('\n')
16
20
  .map((line) => line.trim())
17
21
  .filter((line) => line.length > 0)
18
- .map((line) => JSON.parse(line));
22
+ .map((line) => JSON.parse(line))
23
+ .flatMap((row) => {
24
+ if (typeof row.documentId !== 'string' ||
25
+ typeof row.agentId !== 'string' ||
26
+ typeof row.title !== 'string' ||
27
+ typeof row.path !== 'string' ||
28
+ typeof row.chunkId !== 'string' ||
29
+ typeof row.content !== 'string') {
30
+ return [];
31
+ }
32
+ return [
33
+ {
34
+ documentId: row.documentId,
35
+ agentId: row.agentId,
36
+ title: row.title,
37
+ path: row.path,
38
+ chunkId: row.chunkId,
39
+ chunkOrdinal: typeof row.chunkOrdinal === 'number' ? row.chunkOrdinal : 0,
40
+ content: row.content,
41
+ tags: Array.isArray(row.tags) ? row.tags.filter((item) => typeof item === 'string') : []
42
+ }
43
+ ];
44
+ });
19
45
  };
20
46
  const toRows = (documents) => documents.flatMap((document) => document.chunks.map((chunk) => ({
21
47
  documentId: document.document.id,
@@ -23,12 +49,60 @@ const toRows = (documents) => documents.flatMap((document) => document.chunks.ma
23
49
  title: document.document.title,
24
50
  path: document.document.path,
25
51
  chunkId: chunk.id,
52
+ chunkOrdinal: chunk.ordinal,
26
53
  content: chunk.content,
27
54
  tags: document.document.tags
28
55
  })));
29
56
  const writeManifest = async (vaultPath, manifest) => {
30
57
  await writeFile(toManifestPath(vaultPath), `${JSON.stringify(manifest, null, 2)}\n`, 'utf8');
31
58
  };
59
/**
 * Loads and normalises the search-pack manifest of a vault.
 *
 * Only manifests with `format === 'private-v2'` and `version` 2 or 3 are
 * accepted. Missing or mistyped fields are replaced by defaults, and version 3
 * pack-index entries lacking a string `fileName` or `tokenBloomB64` are dropped.
 *
 * @param {string} vaultPath - Vault root used to resolve the manifest path.
 * @returns {Promise<object|null>} The normalised manifest, or `null` when the
 *   file cannot be read or parsed or has an unsupported version/format.
 */
const readManifest = async (vaultPath) => {
    const numberOr = (value, fallback) => (typeof value === 'number' ? value : fallback);
    const stringsOnly = (value) => (Array.isArray(value)
        ? value.filter((item) => typeof item === 'string')
        : []);
    // Returns a zero- or one-element array so it can be used with flatMap.
    const toPackEntry = (entry) => {
        if (!entry || typeof entry !== 'object') {
            return [];
        }
        const { fileName, tokenBloomB64 } = entry;
        if (typeof fileName !== 'string' || typeof tokenBloomB64 !== 'string') {
            return [];
        }
        return [
            {
                fileName,
                recordCount: numberOr(entry.recordCount, 0),
                agents: stringsOnly(entry.agents),
                tokenBloomB64
            }
        ];
    };
    try {
        const parsed = JSON.parse(await readFile(toManifestPath(vaultPath), 'utf8'));
        const isSupportedVersion = parsed.version === 2 || parsed.version === 3;
        if (!isSupportedVersion || parsed.format !== 'private-v2') {
            return null;
        }
        const createdAt = typeof parsed.createdAt === 'string'
            ? parsed.createdAt
            : new Date().toISOString();
        const recordCount = numberOr(parsed.recordCount, 0);
        if (parsed.version === 2) {
            return {
                version: 2,
                createdAt,
                packCount: numberOr(parsed.packCount, 0),
                recordCount,
                format: 'private-v2'
            };
        }
        const packIndex = Array.isArray(parsed.packIndex)
            ? parsed.packIndex.flatMap((entry) => toPackEntry(entry))
            : [];
        return {
            version: 3,
            createdAt,
            packCount: numberOr(parsed.packCount, packIndex.length),
            recordCount,
            format: 'private-v2',
            packIndex
        };
    }
    catch {
        // Best effort: any read/parse failure is reported as "no manifest".
        return null;
    }
};
32
106
  const chunkRows = (rows, size) => {
33
107
  const chunks = [];
34
108
  for (let index = 0; index < rows.length; index += size) {
@@ -57,6 +131,45 @@ const countOccurrences = (text, token) => {
57
131
  }
58
132
  return hits;
59
133
  };
134
/**
 * Hashes a token into an unsigned 32-bit integer, starting from `seed`.
 * Each UTF-16 code unit is XOR-ed into the state, which is then multiplied
 * by 16777619 with 32-bit wrap-around.
 *
 * @param {string} token - Text to hash.
 * @param {number} seed - Initial state; coerced to an unsigned 32-bit value.
 * @returns {number} Hash in the range [0, 2^32).
 */
const hashToken = (token, seed) => {
    const multiplier = 16777619;
    let state = seed >>> 0;
    let position = 0;
    while (position < token.length) {
        state = Math.imul(state ^ token.charCodeAt(position), multiplier) >>> 0;
        position += 1;
    }
    return state >>> 0;
};
142
/** Allocates an empty (all-zero) bloom filter of `bloomBytes` bytes. */
const createBloom = () => new Uint8Array(bloomBytes);

/**
 * Records a token in the filter by setting one bit per configured seed.
 *
 * @param {Uint8Array} bloom - Filter to mutate in place.
 * @param {string} token - Token to record.
 */
const bloomAdd = (bloom, token) => {
    for (const seed of bloomSeeds) {
        const bit = hashToken(token, seed) % bloomBitSize;
        const byteIndex = Math.floor(bit / 8);
        const mask = 1 << (bit % 8);
        bloom[byteIndex] |= mask;
    }
};

/**
 * Tests whether a token may have been recorded in the filter.
 *
 * @param {Uint8Array} bloom - Filter to query.
 * @param {string} token - Token to look up.
 * @returns {boolean} `false` when at least one of the token's bits is clear,
 *   `true` when all of them are set.
 */
const bloomMayContain = (bloom, token) => {
    for (const seed of bloomSeeds) {
        const bit = hashToken(token, seed) % bloomBitSize;
        const byteIndex = Math.floor(bit / 8);
        const mask = 1 << (bit % 8);
        if ((bloom[byteIndex] & mask) === 0) {
            return false;
        }
    }
    return true;
};
153
/**
 * Builds a bloom filter covering every token found in the given rows.
 * For each row the title, path, space-joined tags and content are joined
 * with spaces and tokenized.
 *
 * @param {Array<{title: string, path: string, tags: string[], content: string}>} rows
 *   Rows that will be stored in one pack.
 * @returns {Uint8Array} Filter with all row tokens added.
 */
const bloomFromRows = (rows) => {
    const bloom = createBloom();
    for (const row of rows) {
        const searchableText = [row.title, row.path, row.tags.join(' '), row.content].join(' ');
        for (const token of tokenize(searchableText)) {
            bloomAdd(bloom, token);
        }
    }
    return bloom;
};
160
/**
 * Serialises a bloom filter for storage in the manifest.
 *
 * @param {Uint8Array} bloom - Filter bytes.
 * @returns {string} The bytes encoded as base64url.
 */
const bloomToBase64 = (bloom) => Buffer.from(bloom).toString('base64url');

/**
 * Restores a bloom filter from its base64url manifest representation.
 *
 * When the value cannot be decoded or does not decode to exactly
 * `bloomBytes` bytes, a saturated (all bits set) filter is returned. A bloom
 * filter must never report a present token as absent; an all-zero fallback
 * would do exactly that and hide the pack from every query, whereas a
 * saturated filter only costs an unnecessary pack scan.
 *
 * @param {string} value - base64url text produced by `bloomToBase64`.
 * @returns {Uint8Array} The decoded filter, or a saturated filter on failure.
 */
const bloomFromBase64 = (value) => {
    try {
        const decoded = Buffer.from(value, 'base64url');
        if (decoded.byteLength === bloomBytes) {
            return new Uint8Array(decoded);
        }
    }
    catch {
        // Non-string input makes Buffer.from throw; treated like corrupt data below.
    }
    return new Uint8Array(bloomBytes).fill(0xff);
};
60
173
  const computeTextScore = (row, tokens) => {
61
174
  if (tokens.length === 0) {
62
175
  return 0;
@@ -79,6 +192,7 @@ const toSearchResult = (row, score) => ({
79
192
  title: row.title,
80
193
  path: row.path,
81
194
  chunkId: row.chunkId,
195
+ chunkOrdinal: row.chunkOrdinal,
82
196
  content: row.content,
83
197
  score,
84
198
  textScore: score,
@@ -110,24 +224,55 @@ const writeRowsAsPrivatePacks = async (vaultPath, rows, clearExisting) => {
110
224
  .map((name) => rm(join(directory, name), { force: true })));
111
225
  }
112
226
  const chunks = chunkRows(rows, rowChunkSize);
113
- await Promise.all(chunks.map(async (chunk, index) => {
227
+ const packIndex = await Promise.all(chunks.map(async (chunk, index) => {
114
228
  const fileName = `pack-${String(index + 1).padStart(4, '0')}.blpk`;
115
229
  const serialized = `${chunk.map((row) => JSON.stringify(row)).join('\n')}\n`;
116
230
  const compressed = await encodePrivatePack(vaultPath, Buffer.from(serialized, 'utf8'));
231
+ const tokenBloomB64 = bloomToBase64(bloomFromRows(chunk));
117
232
  await writeFile(join(directory, fileName), compressed);
233
+ return {
234
+ fileName,
235
+ recordCount: chunk.length,
236
+ agents: Array.from(new Set(chunk.map((row) => row.agentId))).sort((left, right) => left.localeCompare(right)),
237
+ tokenBloomB64
238
+ };
118
239
  }));
119
240
  await writeManifest(vaultPath, {
120
- version: 2,
241
+ version: 3,
121
242
  createdAt: new Date().toISOString(),
122
243
  packCount: chunks.length,
123
244
  recordCount: rows.length,
124
- format: 'private-v2'
245
+ format: 'private-v2',
246
+ packIndex
125
247
  });
126
248
  return {
127
249
  packCount: chunks.length,
128
250
  recordCount: rows.length
129
251
  };
130
252
  };
253
/**
 * Chooses which pack files must be decrypted and scanned for a query, using
 * the per-pack agent list and token bloom filter stored in a version 3 manifest.
 *
 * The prefilter is conservative so it can only skip packs, never lose them:
 * - without a usable version 3 manifest every pack on disk is returned;
 * - manifest entries whose file is not on disk are ignored;
 * - packs on disk that have no manifest entry are always candidates;
 * - an entry with an empty agent list is treated as "agents unknown";
 * - when no pack matches any token, the agent-filtered set (or, if that is
 *   empty too, every pack) is returned.
 *
 * @param {string} vaultPath - Vault root.
 * @param {string[]} tokens - Query tokens; an empty list disables bloom filtering.
 * @param {string} [agentId] - Optional agent to restrict packs to.
 * @returns {Promise<string[]>} Unique pack file names to scan.
 */
const selectCandidatePackFiles = async (vaultPath, tokens, agentId) => {
    const allFiles = await sortedPackFiles(vaultPath);
    if (allFiles.length === 0) {
        return [];
    }
    const manifest = await readManifest(vaultPath);
    if (!manifest || manifest.version !== 3 || !Array.isArray(manifest.packIndex)) {
        return allFiles;
    }
    const filesOnDisk = new Set(allFiles);
    // Stale entries would make the caller try to read a missing file.
    const indexed = manifest.packIndex.filter((entry) => filesOnDisk.has(entry.fileName));
    const indexedNames = new Set(indexed.map((entry) => entry.fileName));
    // Packs without metadata cannot be ruled out, so they are always scanned.
    const unindexed = allFiles.filter((fileName) => !indexedNames.has(fileName));
    const toFileNames = (entries) => Array.from(new Set([
        ...entries.map((entry) => entry.fileName),
        ...unindexed
    ]));
    const normalizedAgent = agentId?.trim();
    const byAgent = indexed.filter((entry) => !normalizedAgent
        || entry.agents.length === 0
        || entry.agents.includes(normalizedAgent));
    if (tokens.length === 0) {
        return toFileNames(byAgent);
    }
    const byToken = byAgent.filter((entry) => {
        const bloom = bloomFromBase64(entry.tokenBloomB64);
        return tokens.some((token) => bloomMayContain(bloom, token));
    });
    if (byToken.length > 0) {
        return toFileNames(byToken);
    }
    return byAgent.length > 0 ? toFileNames(byAgent) : allFiles;
};
131
276
  export const buildSearchPacks = async (vaultPath, documents) => {
132
277
  return writeRowsAsPrivatePacks(vaultPath, toRows(documents), true);
133
278
  };
@@ -158,14 +303,19 @@ export const searchInPacks = async (vaultPath, query, limit, agentId) => {
158
303
  if (limit <= 0 || tokens.length === 0) {
159
304
  return [];
160
305
  }
161
- const files = await sortedPackFiles(vaultPath);
306
+ const files = await selectCandidatePackFiles(vaultPath, tokens, normalizedAgent);
162
307
  if (files.length === 0) {
163
308
  return [];
164
309
  }
165
310
  const scored = [];
166
311
  for (const file of files) {
167
312
  const rows = await parseRowsFromPack(vaultPath, await readFile(join(toPackDirectory(vaultPath), file)));
168
- rows.forEach((row) => {
313
+ const traversal = middleOutIndices(rows.length, Math.floor(rows.length / 2));
314
+ traversal.forEach((rowIndex) => {
315
+ const row = rows[rowIndex];
316
+ if (!row) {
317
+ return;
318
+ }
169
319
  if (normalizedAgent && row.agentId !== normalizedAgent) {
170
320
  return;
171
321
  }
@@ -465,6 +465,7 @@ Search modes:
465
465
  - `semantic`: local deterministic embedding similarity.
466
466
 
467
467
  Hybrid results are cached in-memory for a short TTL and invalidated when `.brainlink/index.json` changes.
468
+ Context assembly uses middle-out ordering inside each note: the highest-scoring chunk is selected first, then nearby chunks are expanded while token budget allows.
468
469
 
469
470
  ### Build Agent Context
470
471
 
@@ -635,6 +636,7 @@ GET /api/validate
635
636
  The HTTP API is read-only. Use the CLI for writes and indexing.
636
637
 
637
638
  Indexing writes private encrypted search packs at `.brainlink/search-packs/*.blpk` for resilient retrieval and portability.
639
+ Pack search now uses compressed-space prefiltering (token bloom index per pack) before decrypting/reading pack payloads.
638
640
  Pack decryption keys are resolved from `$BRAINLINK_HOME/keys` (or `BRAINLINK_SEARCH_PACK_KEY` when explicitly set).
639
641
 
640
642
  ## Agent Integration Contract
@@ -138,8 +138,10 @@ read markdown files
138
138
  question
139
139
  -> selected mode: fts | semantic | hybrid
140
140
  -> optional query embedding
141
+ -> optional compressed pack prefilter (token bloom)
141
142
  -> lexical scoring and/or semantic cosine scoring
142
143
  -> cosine similarity over candidate chunks
144
+ -> middle-out context expansion around strongest chunk
143
145
  -> ranked chunks with textScore and semanticScore
144
146
  -> token-budget selection
145
147
  -> Markdown context package
@@ -293,6 +295,7 @@ Markdown keeps the system portable, inspectable, Git-friendly, and compatible wi
293
295
  Brainlink uses a local JSON index plus encrypted pack exports for fast rebuildable retrieval without external infrastructure.
294
296
  Hybrid retrieval also uses a short-lived in-memory cache keyed by vault/query/agent and invalidated by index file mtime to reduce repeated query latency.
295
297
  Indexing exports private encrypted pack files (`.brainlink/search-packs/*.blpk`) from indexed chunks for fast retrieval and recovery continuity.
298
+ Pack manifests include compressed-space token bloom metadata so retrieval can skip unrelated packs before decryption.
296
299
  Pack encryption keys are resolved from `$BRAINLINK_HOME/keys` or from `BRAINLINK_SEARCH_PACK_KEY` when configured.
297
300
  Legacy `.jsonl.gz` search packs are auto-upgraded to `.blpk` on first retrieval flow.
298
301
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@andespindola/brainlink",
3
- "version": "0.1.0-beta.16",
3
+ "version": "0.1.0-beta.17",
4
4
  "description": "Local-first knowledge memory for agents with Markdown, backlinks, indexing and context retrieval.",
5
5
  "type": "module",
6
6
  "license": "MIT",