pixmap-engine 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +15 -0
- package/README.md +103 -0
- package/dist/embedder.js +75 -0
- package/dist/engine.js +66 -0
- package/dist/index.js +259 -0
- package/dist/indexer.js +10 -0
- package/dist/metadataDb.js +69 -0
- package/dist/preview.js +216 -0
- package/dist/searcher.js +17 -0
- package/dist/types.js +3 -0
- package/dist/vectorStore.js +47 -0
- package/docs/HOW.md +177 -0
- package/package.json +33 -0
- package/src/embedder.ts +100 -0
- package/src/engine.ts +106 -0
- package/src/index.ts +310 -0
- package/src/indexer.ts +21 -0
- package/src/metadataDb.ts +87 -0
- package/src/preview.ts +292 -0
- package/src/searcher.ts +30 -0
- package/src/types.ts +19 -0
- package/src/vectorStore.ts +59 -0
- package/tsconfig.json +16 -0
package/dist/preview.js
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
import sharp from 'sharp';
|
|
2
|
+
/**
 * Decide which terminal image protocol to use, based purely on environment
 * variables. Checked once per call; the module caches the result at load time.
 */
function detectProtocol() {
    const program = process.env.TERM_PROGRAM ?? '';
    const term = process.env.TERM ?? '';
    // iTerm2, WezTerm, Hyper, Tabby, mintty all support iTerm2 inline images
    const itermPrograms = ['iTerm.app', 'WezTerm', 'Hyper', 'Tabby'];
    if (itermPrograms.includes(program) ||
        process.env.WEZTERM_PANE != null ||
        process.env.MINTTY_SHORTCUT != null) {
        return 'iterm';
    }
    // Kitty
    if (program === 'kitty' || process.env.KITTY_PID != null) {
        return 'kitty';
    }
    // VS Code terminal supports iTerm2 protocol
    if (program === 'vscode') {
        return 'iterm';
    }
    // Sixel support (xterm with sixel, foot, mlterm)
    if (term.includes('xterm') && process.env.SIXEL_SUPPORT === '1') {
        return 'sixel';
    }
    return 'halfblock';
}
|
|
28
|
+
/**
 * Usable terminal width in columns: the reported width minus a 4-column
 * margin, with a fallback of 120 when stdout is not a TTY.
 *
 * Clamped to a floor of 10 so that very narrow terminals never produce a
 * zero or negative width — downstream code feeds this value into sharp's
 * .resize(), which rejects nonpositive dimensions.
 */
function getTermCols() {
    const reported = process.stdout.columns || 120;
    return Math.max(reported - 4, 10);
}
|
|
32
|
+
// ── iTerm2 inline image protocol ────────────────────────────────────
/**
 * Render an image as an iTerm2 OSC 1337 inline-image escape sequence.
 * Cell geometry assumption: one column ≈ 8px, one row ≈ 16px.
 */
async function renderIterm(imagePath, widthCols, heightRows) {
    // Resize with sharp first so we control the transmitted pixel size.
    const meta = await sharp(imagePath).metadata();
    const srcW = meta.width ?? 400;
    const srcH = meta.height ?? 400;
    const pxW = widthCols * 8;
    // Height follows the source aspect ratio, optionally capped by rows.
    const scaledH = Math.round(pxW * (srcH / srcW));
    const pxH = heightRows ? Math.min(scaledH, heightRows * 16) : scaledH;
    const png = await sharp(imagePath)
        .resize(pxW, pxH, { fit: 'inside', kernel: 'lanczos3' })
        .png()
        .toBuffer();
    const payload = png.toString('base64');
    const args = `inline=1;width=${widthCols};preserveAspectRatio=1`;
    // OSC 1337 ; File=[args]:[base64 data] ST
    return ` \x1b]1337;File=${args}:${payload}\x07`;
}
|
|
57
|
+
// ── Kitty graphics protocol ─────────────────────────────────────────
/**
 * Render an image via the Kitty graphics protocol (APC _G sequences).
 * The base64 payload must be split into chunks of at most 4096 bytes,
 * with m=1 on every chunk except the last.
 */
async function renderKitty(imagePath, widthCols, heightRows) {
    const meta = await sharp(imagePath).metadata();
    const srcW = meta.width ?? 400;
    const srcH = meta.height ?? 400;
    // Same 8px/column, 16px/row cell geometry as the iTerm2 path.
    const pxW = widthCols * 8;
    let pxH = Math.round(pxW * (srcH / srcW));
    if (heightRows) {
        pxH = Math.min(pxH, heightRows * 16);
    }
    const png = await sharp(imagePath)
        .resize(pxW, pxH, { fit: 'inside', kernel: 'lanczos3' })
        .png()
        .toBuffer();
    const b64 = png.toString('base64');
    const CHUNK = 4096;
    const parts = [];
    let offset = 0;
    while (offset < b64.length) {
        const piece = b64.slice(offset, offset + CHUNK);
        const more = offset + CHUNK < b64.length ? 1 : 0;
        // First chunk carries the transmission header (a=T: transmit+display,
        // f=100: PNG payload, c=<cols>: display width in cells).
        const header = offset === 0
            ? `a=T,f=100,c=${widthCols},m=${more}`
            : `m=${more}`;
        parts.push(`\x1b_G${header};${piece}\x1b\\`);
        offset += CHUNK;
    }
    return ' ' + parts.join('');
}
|
|
88
|
+
// ── Half-block fallback (true-color ANSI) ───────────────────────────
// U+2580 UPPER HALF BLOCK: foreground paints the top pixel, background
// paints the bottom pixel, so one character cell shows two pixel rows.
const UPPER_HALF = '\u2580';
const RESET = '\x1b[0m';
/** 24-bit (true-color) foreground escape for the given RGB triple. */
function fg(r, g, b) {
    const params = ['38', '2', String(r), String(g), String(b)].join(';');
    return `\x1b[${params}m`;
}
/** 24-bit (true-color) background escape for the given RGB triple. */
function bgc(r, g, b) {
    const params = ['48', '2', String(r), String(g), String(b)].join(';');
    return `\x1b[${params}m`;
}
|
|
97
|
+
/**
 * Decode an image into a row-major grid of [r, g, b] triples sized for
 * half-block rendering: `cols` pixels wide, with height following the
 * aspect ratio (one terminal row displays two pixel rows).
 */
async function loadPixels(imagePath, cols, maxTermRows) {
    const meta = await sharp(imagePath).metadata();
    const srcW = meta.width ?? cols;
    const srcH = meta.height ?? cols;
    const aspect = srcH / srcW;
    let pxW = cols;
    let pxH = Math.round(cols * aspect);
    // Too tall for the row budget? Cap height and rescale width to match.
    if (maxTermRows && pxH > maxTermRows * 2) {
        pxH = maxTermRows * 2;
        pxW = Math.round(pxH / aspect);
    }
    pxW = Math.max(2, pxW);
    pxH = Math.max(2, pxH);
    // Even height pairs cleanly into upper/lower half-blocks.
    if (pxH % 2 !== 0) {
        pxH += 1;
    }
    const { data, info } = await sharp(imagePath)
        .resize(pxW, pxH, { fit: 'fill', kernel: 'lanczos3' })
        .removeAlpha()
        .raw()
        .toBuffer({ resolveWithObject: true });
    // Unpack the flat RGB byte buffer into rows of [r, g, b] triples.
    const pixels = Array.from({ length: info.height }, (_, y) =>
        Array.from({ length: info.width }, (_, x) => {
            const o = (y * info.width + x) * 3;
            return [data[o], data[o + 1], data[o + 2]];
        }));
    return { pixels, width: info.width, height: info.height };
}
|
|
129
|
+
/**
 * Convert a pixel grid into ANSI half-block art: each output line packs
 * two pixel rows (top = foreground color, bottom = background color).
 * Missing bottom rows (odd-height grids) render as black.
 */
function pixelsToAnsi(pixels, indent = 2) {
    const pad = ' '.repeat(indent);
    const out = [];
    for (let y = 0; y < pixels.length; y += 2) {
        const top = pixels[y];
        const bottom = y + 1 < pixels.length ? pixels[y + 1] : null;
        const cells = top.map((topPx, x) => {
            const [tr, tg, tb] = topPx;
            const [br, bg, bb] = bottom ? bottom[x] : [0, 0, 0];
            return `${fg(tr, tg, tb)}${bgc(br, bg, bb)}${UPPER_HALF}`;
        });
        out.push(pad + cells.join('') + RESET);
    }
    return out.join('\n');
}
|
|
146
|
+
/** Render an image as ANSI half-block art (the universal fallback). */
async function renderHalfblock(imagePath, cols, maxTermRows) {
    const grid = await loadPixels(imagePath, cols, maxTermRows);
    return pixelsToAnsi(grid.pixels);
}
|
|
150
|
+
// ── Public API ──────────────────────────────────────────────────────
// Detected once at module load; the hosting terminal does not change mid-run.
const protocol = detectProtocol();
/** Render a single image to terminal string */
export async function renderImage(imagePath, cols, maxTermRows) {
    const width = cols ?? getTermCols();
    if (protocol === 'iterm') {
        return renderIterm(imagePath, width, maxTermRows);
    }
    if (protocol === 'kitty') {
        return renderKitty(imagePath, width, maxTermRows);
    }
    return renderHalfblock(imagePath, width, maxTermRows);
}
|
|
164
|
+
/** Render multiple images side-by-side in a row with labels on top */
export async function renderImageRow(panels, panelCols, maxPanelRows = 24, gap = 2) {
    const termW = getTermCols();
    const cols = panelCols ?? Math.floor((termW - gap * (panels.length - 1)) / panels.length);
    // Protocol images cannot be composed side-by-side, so for iTerm2/Kitty
    // we stack panels vertically: label, image, blank line, repeated.
    if (protocol === 'iterm' || protocol === 'kitty') {
        const out = [];
        for (const panel of panels) {
            out.push(` ${panel.label}`);
            out.push(await renderImage(panel.imagePath, cols, maxPanelRows));
            out.push('');
        }
        return out.join('\n');
    }
    // Half-block: true side-by-side rendering
    const grids = await Promise.all(panels.map((p) => loadPixels(p.imagePath, cols, maxPanelRows)));
    const indent = '  ';
    const gapStr = ' '.repeat(gap);
    // Header row: one label per panel, truncated with an ellipsis or padded
    // to exactly `cols` characters so columns line up.
    const labels = panels.map((p) =>
        p.label.length > cols ? p.label.slice(0, cols - 1) + '\u2026' : p.label.padEnd(cols));
    const lines = [indent + labels.join(gapStr)];
    const tallest = Math.max(...grids.map((g) => g.height));
    for (let y = 0; y < tallest; y += 2) {
        const segments = grids.map(({ pixels, width }) => {
            let seg = '';
            for (let x = 0; x < width; x++) {
                // Shorter panels render black below their last pixel row.
                const [tr, tg, tb] = y < pixels.length ? pixels[y][x] : [0, 0, 0];
                const [br, bg, bb] = y + 1 < pixels.length ? pixels[y + 1][x] : [0, 0, 0];
                seg += `${fg(tr, tg, tb)}${bgc(br, bg, bb)}${UPPER_HALF}`;
            }
            seg += RESET;
            // Pad panels narrower than the allotted column span.
            if (width < cols) {
                seg += ' '.repeat(cols - width);
            }
            return seg;
        });
        lines.push(indent + segments.join(gapStr));
    }
    return lines.join('\n');
}
|
package/dist/searcher.js
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import { DEFAULT_TOP_K } from './types.js';
|
|
2
|
+
/**
 * Find the images most similar to a query image.
 * Embeds the query, asks the vector store for the nearest neighbors, then
 * joins each hit with its metadata row. Hits whose metadata is missing
 * (e.g. the row was deleted after indexing) are silently dropped.
 */
export async function findSimilar(queryImagePath, embedder, store, db, topK = DEFAULT_TOP_K) {
    const queryVec = await embedder.embed(queryImagePath);
    const hits = store.search(queryVec, topK);
    return hits
        .map((hit) => {
            const metadata = db.get(hit.id);
            return metadata ? { ...metadata, score: hit.score } : null;
        })
        .filter((result) => result !== null);
}
|
package/dist/vectorStore.js
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import hnswlib from 'hnswlib-node';
|
|
3
|
+
import { DEFAULT_MAX_ELEMENTS, VECTOR_DIMENSIONS } from './types.js';
|
|
4
|
+
const { HierarchicalNSW } = hnswlib;
|
|
5
|
+
/**
 * Thin wrapper around an hnswlib HierarchicalNSW index configured for
 * cosine distance. Vector ids are expected to be SQLite row ids.
 */
export class VectorStore {
    index;
    dim;
    maxElements;
    /**
     * @param dim         vector dimensionality (defaults to the CLIP size)
     * @param maxElements initial capacity; grows 1.5x when exhausted
     */
    constructor(dim = VECTOR_DIMENSIONS, maxElements = DEFAULT_MAX_ELEMENTS) {
        this.dim = dim;
        this.maxElements = maxElements;
        this.index = new HierarchicalNSW('cosine', dim);
    }
    /** Load a persisted index from disk, or initialize an empty one. */
    initOrLoad(indexPath) {
        if (!fs.existsSync(indexPath)) {
            this.index.initIndex(this.maxElements);
            return;
        }
        this.index.readIndexSync(indexPath);
    }
    /** Insert a vector under the given id, growing the index first if full. */
    add(id, vector) {
        this.ensureCapacity();
        this.index.addPoint(Array.from(vector), id);
    }
    /**
     * Return the topK nearest ids with similarity scores.
     * Scores are 1 - cosine distance, so 1.0 means identical.
     */
    search(queryVector, topK) {
        const { neighbors, distances } = this.index.searchKnn(Array.from(queryVector), topK);
        return neighbors.map((id, rank) => ({ id, score: 1 - distances[rank] }));
    }
    /** Persist the index (vectors + graph + config) to a single binary file. */
    save(path) {
        this.index.writeIndex(path);
    }
    /** Number of vectors currently in the index. */
    getCount() {
        return this.index.getCurrentCount();
    }
    /** Grow capacity by 1.5x once the index is full (no-op otherwise). */
    ensureCapacity() {
        if (this.index.getCurrentCount() < this.maxElements) {
            return;
        }
        const grown = Math.ceil(this.maxElements * 1.5);
        this.index.resizeIndex(grown);
        this.maxElements = grown;
    }
}
|
package/docs/HOW.md
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
# How pixmap works
|
|
2
|
+
|
|
3
|
+
This document walks through the internals — how images get indexed, how search works, and why the stack is what it is.
|
|
4
|
+
|
|
5
|
+
## The pieces
|
|
6
|
+
|
|
7
|
+
Pixmap has a pretty simple architecture. There are three core components and a thin engine that ties them together:
|
|
8
|
+
|
|
9
|
+
```
|
|
10
|
+
PixmapEngine (src/engine.ts)
|
|
11
|
+
├── ImageEmbedder (src/embedder.ts) — image → 512-d vector
|
|
12
|
+
├── VectorStore (src/vectorStore.ts) — HNSW index (add/search/save)
|
|
13
|
+
└── MetadataDb (src/metadataDb.ts) — SQLite (paths, ids, timestamps)
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
On top of these, `indexer.ts` and `searcher.ts` are small workflow modules that coordinate the add and search operations respectively. The engine itself just wires everything together and exposes the public API.
|
|
17
|
+
|
|
18
|
+
## Indexing an image
|
|
19
|
+
|
|
20
|
+
When you call `engine.add(imagePath)`, here's what actually happens:
|
|
21
|
+
|
|
22
|
+
**1. Dedup check.** The path is upserted into SQLite. If it already exists with `indexed = 1`, we bail early — no work to do. If it exists with `indexed = 0` (maybe a previous run crashed halfway through), we re-process it.
|
|
23
|
+
|
|
24
|
+
**2. Preprocessing.** Sharp resizes the image to 224x224 pixels using a cover fit (maintains aspect ratio, crops the excess). Alpha channels get stripped. The output is a raw PNG buffer — this is what the CLIP model expects.
|
|
25
|
+
|
|
26
|
+
**3. Embedding.** The preprocessed image is fed through `clip-vit-base-patch32` running in ONNX via `@xenova/transformers`. This produces a 512-dimensional float vector. The vector gets L2-normalized so it sits on the unit hypersphere, which is needed for cosine similarity to work correctly with HNSW.
|
|
27
|
+
|
|
28
|
+
**4. Indexing.** The normalized vector is inserted into the HNSW index with the SQLite row ID as its key. The index is saved to disk immediately.
|
|
29
|
+
|
|
30
|
+
**5. Marking done.** The SQLite row gets `indexed = 1`. This is the commit point — if the process dies before this, the next run will re-process the image.
|
|
31
|
+
|
|
32
|
+
## Searching
|
|
33
|
+
|
|
34
|
+
Search follows the same embedding path, then diverges:
|
|
35
|
+
|
|
36
|
+
1. The query image goes through the same Sharp → CLIP → normalize pipeline.
|
|
37
|
+
2. The resulting vector is handed to HNSW, which does an approximate nearest-neighbor search and returns the top-K vector IDs along with their cosine distances.
|
|
38
|
+
3. Distances are converted to similarity scores (`1 - distance`, so 1.0 = identical).
|
|
39
|
+
4. Each ID is looked up in SQLite to get the file path and timestamp.
|
|
40
|
+
5. Results come back as an array of `{ id, path, score, created, indexed }`.
|
|
41
|
+
|
|
42
|
+
The search itself (step 2) takes under 2ms even with 100K vectors in the index. The bottleneck is always the CLIP embedding step, which runs ~200-500ms on CPU.
|
|
43
|
+
|
|
44
|
+
## Embedding: CLIP in detail
|
|
45
|
+
|
|
46
|
+
### Why CLIP
|
|
47
|
+
|
|
48
|
+
CLIP understands images semantically. Two photos of a dog — one close-up, one from across a park — will have similar embeddings even though their pixels are completely different. This is what makes it useful for "find images like this one" rather than just pixel-matching.
|
|
49
|
+
|
|
50
|
+
We use `clip-vit-base-patch32`, a ViT-B/32 variant. It's about 150MB, runs fine on CPU, and gives 512-d vectors. The model downloads automatically on first run and gets cached in `~/.cache/huggingface/`.
|
|
51
|
+
|
|
52
|
+
### The preprocessing pipeline
|
|
53
|
+
|
|
54
|
+
CLIP needs exactly 224x224x3 input. Here's how we get there:
|
|
55
|
+
|
|
56
|
+
1. **Resize** to 224x224 with cover fit — aspect ratio is preserved, overflow is cropped.
|
|
57
|
+
2. **Strip alpha** — drop down to 3 channels (RGB).
|
|
58
|
+
3. **Export as PNG** — lossless buffer, no compression artifacts.
|
|
59
|
+
4. **Build RawImage** — 150,528 pixel values as a Uint8Array, ready for the model.
|
|
60
|
+
|
|
61
|
+
### Normalization
|
|
62
|
+
|
|
63
|
+
After inference, we normalize the vector to unit length:
|
|
64
|
+
|
|
65
|
+
```
|
|
66
|
+
normalized[i] = v[i] / sqrt(v[0]² + v[1]² + ... + v[511]²)
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
This matters because HNSW's cosine distance mode assumes unit vectors. Without normalization, similarity scores would be inconsistent — vectors with larger magnitudes would distort the distance calculations.
|
|
70
|
+
|
|
71
|
+
## Vector index: HNSW
|
|
72
|
+
|
|
73
|
+
### How HNSW works
|
|
74
|
+
|
|
75
|
+
HNSW (Hierarchical Navigable Small World) is a graph-based approximate nearest neighbor algorithm. Think of it like a skip list but in vector space:
|
|
76
|
+
|
|
77
|
+
- The bottom layer has every vector, connected to its closest neighbors.
|
|
78
|
+
- Each layer above has fewer and fewer vectors, with longer-range connections.
|
|
79
|
+
- To search, you start at the top (where there are few nodes but long jumps) and work down, getting more precise at each layer.
|
|
80
|
+
|
|
81
|
+
```
|
|
82
|
+
Layer 2: A ──────────── D (express lanes)
|
|
83
|
+
│ │
|
|
84
|
+
Layer 1: A ── B ── C ── D ── E (mid-range)
|
|
85
|
+
│ │ │ │ │
|
|
86
|
+
Layer 0: A B C D E F G H I (all vectors, local connections)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
This gives O(log n) search instead of O(n) brute force. At 100K images, that's the difference between ~2ms and ~500ms per query.
|
|
90
|
+
|
|
91
|
+
### Configuration
|
|
92
|
+
|
|
93
|
+
The index is set up with:
|
|
94
|
+
|
|
95
|
+
- **512 dimensions** (matching CLIP output)
|
|
96
|
+
- **Cosine distance** (since vectors are normalized)
|
|
97
|
+
- **100K initial capacity** (auto-resizes to 1.5x when full)
|
|
98
|
+
|
|
99
|
+
The resize creates a new index, copies everything over, and swaps it in. Not the fastest operation, but it's a one-time cost and it keeps things simple.
|
|
100
|
+
|
|
101
|
+
### Persistence
|
|
102
|
+
|
|
103
|
+
After every `add()` call, the index is written to `index.hnsw`. On startup, if this file exists, it's loaded back. The format is hnswlib's native binary — it includes the vectors, graph edges, and config in a single file.
|
|
104
|
+
|
|
105
|
+
## Metadata: SQLite
|
|
106
|
+
|
|
107
|
+
### Schema
|
|
108
|
+
|
|
109
|
+
```sql
|
|
110
|
+
CREATE TABLE IF NOT EXISTS images (
|
|
111
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
112
|
+
path TEXT NOT NULL UNIQUE,
|
|
113
|
+
indexed INTEGER NOT NULL DEFAULT 0,
|
|
114
|
+
created INTEGER NOT NULL
|
|
115
|
+
);
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
The `id` column does double duty — it's both the primary key in SQLite and the vector ID in HNSW. When HNSW returns vector ID 42, we look up `images WHERE id = 42` to get the file path. Simple 1:1 mapping, no translation layer needed.
|
|
119
|
+
|
|
120
|
+
The `indexed` column (0 or 1) tracks whether the embedding pipeline completed. The `UNIQUE` constraint on `path` prevents the same file from being indexed twice.
|
|
121
|
+
|
|
122
|
+
### Why SQLite instead of just a JSON file
|
|
123
|
+
|
|
124
|
+
A JSON file would work for small collections, but it falls apart quickly:
|
|
125
|
+
|
|
126
|
+
- No atomic writes — a crash mid-save corrupts the whole thing.
|
|
127
|
+
- No indexed lookups — finding a record by path means scanning the entire file.
|
|
128
|
+
- No concurrent access — reading while writing is a recipe for data loss.
|
|
129
|
+
|
|
130
|
+
SQLite handles all of this out of the box. `better-sqlite3` gives us synchronous calls (no callback juggling in the indexer), and WAL mode means reads don't block writes.
|
|
131
|
+
|
|
132
|
+
## Crash recovery
|
|
133
|
+
|
|
134
|
+
The indexing pipeline is designed so that a crash at any point leaves things in a recoverable state:
|
|
135
|
+
|
|
136
|
+
- **Crash before embedding completes:** SQLite has the row with `indexed = 0`. Next time you add the same path, it picks up where it left off.
|
|
137
|
+
- **Crash after embedding but before HNSW save:** The vector is lost (wasn't persisted). SQLite still shows `indexed = 0`, so the next add re-embeds and re-inserts.
|
|
138
|
+
- **Crash after HNSW save but before marking indexed:** On next run, the image gets re-embedded and the vector is re-inserted into HNSW. HNSW handles duplicate IDs by overwriting, so no corruption.
|
|
139
|
+
- **Crash after everything:** Both stores are consistent. `indexed = 1` prevents reprocessing.
|
|
140
|
+
|
|
141
|
+
There's no transaction spanning both stores — instead, the two-phase design (insert metadata first, mark complete last) naturally handles partial failures.
|
|
142
|
+
|
|
143
|
+
## Initialization
|
|
144
|
+
|
|
145
|
+
When `engine.init()` is called, three things happen:
|
|
146
|
+
|
|
147
|
+
1. The CLIP model loads. First run downloads ~150MB from Hugging Face; subsequent runs use the cache.
|
|
148
|
+
2. The HNSW index loads from `index.hnsw` if it exists, otherwise a fresh empty index is created.
|
|
149
|
+
3. SQLite opens `metadata.db`, running the CREATE TABLE statement if needed (it's idempotent).
|
|
150
|
+
|
|
151
|
+
All three dependencies are loaded via dynamic `import()`. This means you can `import { PixmapEngine } from "pixmap"` without triggering any model downloads or native module loading — that only happens when you call `init()`.
|
|
152
|
+
|
|
153
|
+
There's no explicit shutdown or cleanup. HNSW is saved to disk after each add, SQLite handles its own connection lifecycle, and the ONNX session gets garbage-collected normally.
|
|
154
|
+
|
|
155
|
+
## Performance
|
|
156
|
+
|
|
157
|
+
Some rough numbers to set expectations:
|
|
158
|
+
|
|
159
|
+
**Indexing (per image):**
|
|
160
|
+
|
|
161
|
+
| Step | Time |
|
|
162
|
+
|------|------|
|
|
163
|
+
| Sharp resize | ~20ms |
|
|
164
|
+
| CLIP embedding | ~200-500ms |
|
|
165
|
+
| HNSW insert | <1ms |
|
|
166
|
+
| SQLite upsert | <1ms |
|
|
167
|
+
| HNSW disk save | ~5-50ms |
|
|
168
|
+
|
|
169
|
+
The CLIP step dominates. On a modern laptop, expect around 2-4 images per second.
|
|
170
|
+
|
|
171
|
+
**Search:**
|
|
172
|
+
|
|
173
|
+
The embedding step is the same ~200-500ms. The actual HNSW lookup is under 2ms even at 100K scale. So search latency is essentially just the time to embed the query.
|
|
174
|
+
|
|
175
|
+
**Storage:**
|
|
176
|
+
|
|
177
|
+
Each indexed image costs about 2KB (512 floats for the vector, plus graph edges, plus the SQLite row). At 100K images, you're looking at roughly 220MB for the index and 20MB for the database. The original images aren't copied or stored — pixmap only keeps the vectors and paths.
|
package/package.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "pixmap-engine",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"type": "module",
|
|
5
|
+
"description": "On-device image similarity search",
|
|
6
|
+
"main": "./src/engine.ts",
|
|
7
|
+
"exports": {
|
|
8
|
+
".": {
|
|
9
|
+
"import": "./src/engine.ts",
|
|
10
|
+
"default": "./dist/engine.js"
|
|
11
|
+
}
|
|
12
|
+
},
|
|
13
|
+
"bin": {
|
|
14
|
+
"pixmap": "dist/index.js"
|
|
15
|
+
},
|
|
16
|
+
"scripts": {
|
|
17
|
+
"build": "tsc -p tsconfig.json",
|
|
18
|
+
"start": "node dist/index.js",
|
|
19
|
+
"dev": "tsx src/index.ts"
|
|
20
|
+
},
|
|
21
|
+
"dependencies": {
|
|
22
|
+
"@xenova/transformers": "^2.17.2",
|
|
23
|
+
"better-sqlite3": "^11.7.0",
|
|
24
|
+
"hnswlib-node": "^3.0.0",
|
|
25
|
+
"sharp": "^0.33.5"
|
|
26
|
+
},
|
|
27
|
+
"devDependencies": {
|
|
28
|
+
"@types/better-sqlite3": "^7.6.13",
|
|
29
|
+
"@types/node": "^22.10.2",
|
|
30
|
+
"tsx": "^4.19.2",
|
|
31
|
+
"typescript": "^5.7.2"
|
|
32
|
+
}
|
|
33
|
+
}
|
package/src/embedder.ts
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import sharp from 'sharp';
|
|
2
|
+
import { RawImage, pipeline } from '@xenova/transformers';
|
|
3
|
+
import { VECTOR_DIMENSIONS } from './types.js';
|
|
4
|
+
|
|
5
|
+
type Extractor = (input: unknown) => Promise<unknown>;
|
|
6
|
+
|
|
7
|
+
export class ImageEmbedder {
|
|
8
|
+
private extractor: Extractor | null = null;
|
|
9
|
+
|
|
10
|
+
async init(): Promise<void> {
|
|
11
|
+
this.extractor = (await pipeline(
|
|
12
|
+
'image-feature-extraction',
|
|
13
|
+
'Xenova/clip-vit-base-patch32'
|
|
14
|
+
)) as Extractor;
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
async embed(imagePath: string): Promise<Float32Array> {
|
|
18
|
+
if (!this.extractor) {
|
|
19
|
+
throw new Error('ImageEmbedder is not initialized. Call init() first.');
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
const preprocessed = await sharp(imagePath)
|
|
23
|
+
.resize(224, 224, { fit: 'cover' })
|
|
24
|
+
.removeAlpha()
|
|
25
|
+
.toFormat('png')
|
|
26
|
+
.toBuffer();
|
|
27
|
+
|
|
28
|
+
const image = await this.bufferToRawImage(preprocessed);
|
|
29
|
+
const output = await this.extractor(image);
|
|
30
|
+
const vector = this.extractVector(output);
|
|
31
|
+
|
|
32
|
+
if (vector.length !== VECTOR_DIMENSIONS) {
|
|
33
|
+
throw new Error(
|
|
34
|
+
`Unexpected embedding dimensions: ${vector.length}. Expected ${VECTOR_DIMENSIONS}.`
|
|
35
|
+
);
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
return this.l2Normalize(vector);
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
private async bufferToRawImage(buffer: Buffer): Promise<unknown> {
|
|
42
|
+
const raw = RawImage as unknown as {
|
|
43
|
+
fromBlob?: (blob: Blob) => Promise<unknown>;
|
|
44
|
+
fromBuffer?: (b: Buffer) => Promise<unknown>;
|
|
45
|
+
read?: (b: Buffer) => Promise<unknown>;
|
|
46
|
+
};
|
|
47
|
+
|
|
48
|
+
if (typeof raw.fromBlob === 'function') {
|
|
49
|
+
const view = Uint8Array.from(buffer);
|
|
50
|
+
return raw.fromBlob(new Blob([view], { type: 'image/png' }));
|
|
51
|
+
}
|
|
52
|
+
if (typeof raw.fromBuffer === 'function') {
|
|
53
|
+
return raw.fromBuffer(buffer);
|
|
54
|
+
}
|
|
55
|
+
if (typeof raw.read === 'function') {
|
|
56
|
+
return raw.read(buffer);
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
throw new Error('No compatible RawImage constructor found in @xenova/transformers.');
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
private extractVector(output: unknown): Float32Array {
|
|
63
|
+
let data: number[] | Float32Array | undefined;
|
|
64
|
+
|
|
65
|
+
if (output instanceof Float32Array) {
|
|
66
|
+
data = output;
|
|
67
|
+
} else if (Array.isArray(output) && output.every((x) => typeof x === 'number')) {
|
|
68
|
+
data = output as number[];
|
|
69
|
+
} else if (typeof output === 'object' && output !== null) {
|
|
70
|
+
const maybeObject = output as { data?: number[] | Float32Array };
|
|
71
|
+
if (maybeObject.data) {
|
|
72
|
+
data = maybeObject.data;
|
|
73
|
+
} else if (Array.isArray(output)) {
|
|
74
|
+
const first = output[0] as { data?: number[] | Float32Array } | undefined;
|
|
75
|
+
data = first?.data;
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
if (!data) {
|
|
80
|
+
throw new Error('Failed to extract vector data from model output.');
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
return data instanceof Float32Array ? data : new Float32Array(data);
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
private l2Normalize(vector: Float32Array): Float32Array {
|
|
87
|
+
let sumSquares = 0;
|
|
88
|
+
for (let i = 0; i < vector.length; i += 1) {
|
|
89
|
+
sumSquares += vector[i] * vector[i];
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
const norm = Math.sqrt(sumSquares) || 1;
|
|
93
|
+
const normalized = new Float32Array(vector.length);
|
|
94
|
+
for (let i = 0; i < vector.length; i += 1) {
|
|
95
|
+
normalized[i] = vector[i] / norm;
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
return normalized;
|
|
99
|
+
}
|
|
100
|
+
}
|