2ndbrain 2026.1.30 → 2026.1.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,281 @@
1
+ /**
2
+ * Embeddings engine -- manages pgvector-backed semantic search infrastructure.
3
+ *
4
+ * Handles startup configuration resolution per spec section 11.4:
5
+ * 1. Resolve dimensions from env var or model defaults
6
+ * 2. First-time setup: create extension, tables, index
7
+ * 3. Model switch: drop/recreate vector column, queue re-embedding
8
+ * 4. No change: skip
9
+ *
10
+ * Only creates pgvector tables when EMBEDDING_PROVIDER is set.
11
+ */
12
+
13
/**
 * Default vector dimensions for known OpenAI embedding models.
 *
 * The table is prototype-free and frozen: a lookup with an arbitrary model
 * name (for example "constructor" or "toString") yields `undefined` instead
 * of an inherited Object.prototype member, and the shared defaults cannot be
 * mutated at runtime.
 */
const MODEL_DIMENSION_DEFAULTS = Object.freeze(
  Object.assign(Object.create(null), {
    'text-embedding-3-small': 1536,
    'text-embedding-3-large': 3072,
    'text-embedding-ada-002': 1536,
  }),
);
21
+
22
/**
 * Manages the pgvector-backed schema used for semantic search: resolves the
 * vector dimension at startup, creates the tables/index on first use, and
 * rebuilds the vector column when the configured provider/model/dimension
 * changes.
 */
class EmbeddingsEngine {
  /**
   * @param {object} deps
   * @param {object} deps.db - Database query interface ({ query(sql, params) }).
   * @param {object} deps.config - Application configuration.
   * @param {object} deps.logger - Logger instance.
   */
  constructor({ db, config, logger }) {
    this.db = db;
    this.config = config;
    this.logger = logger;
    this._dimensions = null;
  }

  /**
   * Returns true when the embedding provider is configured.
   *
   * @returns {boolean}
   */
  isEnabled() {
    return Boolean(this.config.EMBEDDING_PROVIDER);
  }

  /**
   * Run startup configuration resolution.
   *
   * 1. Resolve dimensions from EMBEDDING_DIMENSIONS or the model defaults;
   *    throws when neither yields a valid positive integer.
   * 2. First-time setup (config table missing or empty): create the
   *    extension, both tables, the vector index and the config row.
   * 3. Configuration differs from the stored row: rebuild the vector column.
   * 4. Configuration unchanged: log and return.
   *
   * Does nothing except log when no provider is configured.
   *
   * @returns {Promise<void>}
   * @throws {Error} When dimensions cannot be resolved or a query fails.
   */
  async initialize() {
    if (!this.isEnabled()) {
      this.logger.info('embeddings', 'Embedding provider not configured; embeddings disabled.');
      return;
    }

    const provider = this.config.EMBEDDING_PROVIDER;
    const model = this.config.EMBEDDING_MODEL || 'text-embedding-3-small';
    const dimensions = this._resolveDimensions(model);
    this._dimensions = dimensions;

    this.logger.info(
      'embeddings',
      `Initializing embeddings: provider=${provider} model=${model} dimensions=${dimensions}`,
    );

    // Ensure the pgvector extension is available
    await this.db.query('CREATE EXTENSION IF NOT EXISTS vector');

    // Check whether the embedding_config table already exists
    const tableCheck = await this.db.query(
      `SELECT EXISTS (
         SELECT FROM information_schema.tables
         WHERE table_schema = 'public'
           AND table_name = 'embedding_config'
       ) AS table_exists`,
    );

    if (!tableCheck.rows[0].table_exists) {
      await this._firstTimeSetup(provider, model, dimensions);
      return;
    }

    // Table exists -- check for an existing config row
    const configRow = await this.db.query(
      'SELECT provider, model, dimensions FROM embedding_config WHERE id = 1',
    );

    if (configRow.rows.length === 0) {
      // Table present but empty -- treat as first-time setup
      await this._firstTimeSetup(provider, model, dimensions);
      return;
    }

    const current = configRow.rows[0];

    // Normalise the stored dimension to a number before comparing so that a
    // driver returning the integer column as a string cannot trigger a
    // destructive model switch on every startup.
    const unchanged =
      current.provider === provider &&
      current.model === model &&
      Number(current.dimensions) === dimensions;

    if (unchanged) {
      this.logger.info('embeddings', 'Embedding configuration unchanged.');
      return;
    }

    // Configuration differs -- perform model switch
    await this._handleModelSwitch(current, { provider, model, dimensions });
  }

  /**
   * Queue an entity for background embedding generation.
   * Inserts a row with a NULL vector; an existing row for the same
   * (entity_type, entity_id) pair is left untouched. No-op when embeddings
   * are disabled.
   *
   * @param {string} entityType - Entity type (e.g. 'message', 'node', 'journal', 'issue').
   * @param {number} entityId - Primary key of the source entity.
   * @returns {Promise<void>}
   */
  async queueEmbedding(entityType, entityId) {
    if (!this.isEnabled()) {
      return;
    }

    await this.db.query(
      `INSERT INTO embeddings (entity_type, entity_id)
       VALUES ($1, $2)
       ON CONFLICT (entity_type, entity_id) DO NOTHING`,
      [entityType, entityId],
    );
  }

  // ---------------------------------------------------------------------------
  // Internal helpers
  // ---------------------------------------------------------------------------

  /**
   * Resolve the target vector dimensions from the EMBEDDING_DIMENSIONS setting
   * or the known model defaults.
   *
   * The explicit setting must consist solely of decimal digits. parseInt()
   * would silently accept "1e3" as 1 or "1536abc" as 1536, which would create
   * a vector column of the wrong size.
   *
   * @param {string} model - Embedding model name.
   * @returns {number} Resolved dimension count.
   * @throws {Error} When dimensions cannot be determined.
   */
  _resolveDimensions(model) {
    const configured = this.config.EMBEDDING_DIMENSIONS;

    if (configured) {
      const text = String(configured).trim();
      const dim = Number(text);
      if (!/^\d+$/.test(text) || !Number.isSafeInteger(dim) || dim <= 0) {
        throw new Error(`Invalid EMBEDDING_DIMENSIONS value: "${configured}"`);
      }
      return dim;
    }

    // Own-property check: model names such as "constructor" must not resolve
    // to members inherited from Object.prototype.
    const isKnownModel = Object.prototype.hasOwnProperty.call(MODEL_DIMENSION_DEFAULTS, model);
    const defaultDim = isKnownModel ? MODEL_DIMENSION_DEFAULTS[model] : undefined;
    if (!defaultDim) {
      throw new Error(
        `Unknown embedding model "${model}" and EMBEDDING_DIMENSIONS is not set. ` +
          `Set EMBEDDING_DIMENSIONS explicitly or use a known model: ` +
          `${Object.keys(MODEL_DIMENSION_DEFAULTS).join(', ')}`,
      );
    }

    return defaultDim;
  }

  /**
   * Build the column type used in DDL for the given dimension count.
   *
   * The value is interpolated into DDL text (column types cannot be bound as
   * query parameters), so it is re-validated here as a positive integer.
   *
   * @param {number} dimensions - Vector dimension count.
   * @returns {string} Column type, e.g. "VECTOR(1536)".
   * @throws {Error} When dimensions is not a positive integer.
   */
  _vectorColumnType(dimensions) {
    if (!Number.isInteger(dimensions) || dimensions <= 0) {
      throw new Error(`Invalid vector dimensions: "${dimensions}"`);
    }
    return `VECTOR(${dimensions})`;
  }

  /**
   * Create the HNSW cosine-distance index on embeddings.vector if possible.
   *
   * pgvector cannot build an HNSW index on a `vector` column with more than
   * 2000 dimensions (e.g. text-embedding-3-large at 3072). In that case the
   * index is skipped with a warning instead of failing startup.
   *
   * @param {number} dimensions - Vector dimension count of the column.
   * @returns {Promise<boolean>} True when the index statement was issued.
   */
  async _createVectorIndex(dimensions) {
    const maxIndexedDimensions = 2000;

    if (dimensions > maxIndexedDimensions) {
      this.logger.warn(
        'embeddings',
        `Skipping HNSW index: ${dimensions} dimensions exceeds the pgvector limit of ` +
          `${maxIndexedDimensions} for indexed vector columns. Similarity search will not ` +
          `be index-accelerated; set EMBEDDING_DIMENSIONS to ${maxIndexedDimensions} or less ` +
          `to enable the index.`,
      );
      return false;
    }

    await this.db.query(`
      CREATE INDEX IF NOT EXISTS idx_embeddings_vector
      ON embeddings USING hnsw (vector vector_cosine_ops)
    `);
    return true;
  }

  /**
   * First-time setup: create the embedding_config and embeddings tables,
   * the vector index (when the dimension allows it), and the config row.
   * Every statement is idempotent, so a partially completed setup can be
   * re-run on the next startup.
   *
   * @param {string} provider - Embedding provider name.
   * @param {string} model - Embedding model name.
   * @param {number} dimensions - Vector dimension count.
   * @returns {Promise<void>}
   */
  async _firstTimeSetup(provider, model, dimensions) {
    // Validate before touching the schema so bad input creates nothing.
    const vectorType = this._vectorColumnType(dimensions);

    this.logger.info('embeddings', 'First-time embedding setup: creating tables and index.');

    // Create the single-row configuration table
    await this.db.query(`
      CREATE TABLE IF NOT EXISTS embedding_config (
        id INTEGER PRIMARY KEY DEFAULT 1 CHECK (id = 1),
        provider TEXT NOT NULL,
        model TEXT NOT NULL,
        dimensions INTEGER NOT NULL,
        created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
        updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
      )
    `);

    // Create the embeddings table with the resolved vector dimension.
    await this.db.query(`
      CREATE TABLE IF NOT EXISTS embeddings (
        id SERIAL PRIMARY KEY,
        created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
        updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
        entity_type TEXT NOT NULL,
        entity_id INTEGER NOT NULL,
        vector ${vectorType},
        UNIQUE(entity_type, entity_id)
      )
    `);

    const indexed = await this._createVectorIndex(dimensions);

    // Insert (or update) the config row
    await this.db.query(
      `INSERT INTO embedding_config (provider, model, dimensions)
       VALUES ($1, $2, $3)
       ON CONFLICT (id) DO UPDATE
       SET provider = $1, model = $2, dimensions = $3, updated_at = NOW()`,
      [provider, model, dimensions],
    );

    this.logger.info(
      'embeddings',
      indexed
        ? 'Embedding tables and index created successfully.'
        : 'Embedding tables created successfully (vector index skipped).',
    );
  }

  /**
   * Handle a model configuration change.
   *
   * Drops the existing vector index and column, recreates the column with the
   * new dimension (leaving every row with a NULL vector), recreates the index
   * when the dimension allows it, and finally updates the config row.
   *
   * The config row is written last and every DDL statement tolerates already
   * being applied, so if the process dies mid-switch the next startup sees
   * the old config and can repeat the whole sequence successfully.
   *
   * @param {object} oldConfig - Previous { provider, model, dimensions }.
   * @param {object} newConfig - New { provider, model, dimensions }.
   * @returns {Promise<void>}
   */
  async _handleModelSwitch(oldConfig, newConfig) {
    // Validate before dropping anything.
    const vectorType = this._vectorColumnType(newConfig.dimensions);

    this.logger.warn(
      'embeddings',
      `Embedding model changed from ${oldConfig.provider}/${oldConfig.model} ` +
        `(${oldConfig.dimensions}d) to ${newConfig.provider}/${newConfig.model} ` +
        `(${newConfig.dimensions}d). All existing embeddings will be dropped and re-generated.`,
    );

    // Drop the HNSW index
    await this.db.query('DROP INDEX IF EXISTS idx_embeddings_vector');

    // Drop and recreate the vector column with the new dimension
    await this.db.query('ALTER TABLE embeddings DROP COLUMN IF EXISTS vector');
    await this.db.query(`ALTER TABLE embeddings ADD COLUMN vector ${vectorType}`);

    // Recreate the HNSW index (skipped above the pgvector dimension limit)
    await this._createVectorIndex(newConfig.dimensions);

    // Update the config row
    await this.db.query(
      `UPDATE embedding_config
       SET provider = $1, model = $2, dimensions = $3, updated_at = NOW()
       WHERE id = 1`,
      [newConfig.provider, newConfig.model, newConfig.dimensions],
    );

    this.logger.info(
      'embeddings',
      'Model switch complete. All embeddings queued for re-generation.',
    );
  }
}
279
+
280
+ export { EmbeddingsEngine };
281
+ export default EmbeddingsEngine;
@@ -0,0 +1,221 @@
1
+ import { generateEmbedding } from '../mcp/embed-server.js';
2
+
3
/**
 * Mapping of entity types to the SQL query that retrieves the text content
 * to be embedded for a given entity_id. Each query takes the entity id as
 * `$1` and exposes the text under the column alias `text`.
 *
 * The table is prototype-free and frozen so that looking up an entity type
 * read from the database (for example "constructor") can never resolve to an
 * inherited Object.prototype member, and the queries cannot be altered at
 * runtime.
 */
const ENTITY_TEXT_SOURCES = Object.freeze(
  Object.assign(Object.create(null), {
    message: Object.freeze({
      query: 'SELECT content AS text FROM conversation_messages WHERE id = $1',
    }),
    node: Object.freeze({
      query: `SELECT name || COALESCE(' ' || note, '') AS text FROM knowledge_nodes WHERE id = $1`,
    }),
    journal: Object.freeze({
      query: 'SELECT note AS text FROM journal WHERE id = $1',
    }),
    issue: Object.freeze({
      query: 'SELECT note AS text FROM issues WHERE id = $1',
    }),
    spec: Object.freeze({
      query: 'SELECT note AS text FROM specifications WHERE id = $1',
    }),
  }),
);
24
+
25
/** Upper bound on the number of NULL-vector rows fetched per polling pass. */
const BATCH_SIZE = 10;

/** Delay, in milliseconds, between one polling pass and the next. */
const POLL_INTERVAL_MS = 5 * 1000;
30
+
31
/**
 * Background embedding worker -- periodically processes rows in the
 * embeddings table that have a NULL vector, generates the embedding via
 * the configured API, and stores the result (spec section 11.4).
 *
 * Rows that cannot be embedded right now (unknown entity type, empty source
 * text, or a failed API/database call) are deferred in memory for a retry
 * delay and excluded from the next selections. Without this, the oldest
 * unprocessable rows would be re-selected on every poll and, once they fill a
 * whole batch, block every newer row forever.
 */
class EmbeddingWorker {
  /**
   * @param {object} deps
   * @param {object} deps.db - Database query interface ({ query(sql, params) }).
   * @param {object} deps.config - Application configuration.
   * @param {object} deps.logger - Logger instance.
   */
  constructor({ db, config, logger }) {
    this.db = db;
    this.config = config;
    this.logger = logger;

    /** @type {ReturnType<typeof setTimeout>|null} */
    this._timer = null;

    /** Whether the worker loop is active. */
    this._running = false;

    /** Guard to prevent overlapping iterations. */
    this._processing = false;

    /** Set by stop() so an in-flight batch ends after its current row. */
    this._stopRequested = false;

    /** How long, in milliseconds, a skipped or failed row is left alone. */
    this._retryDelayMs = 5 * 60 * 1000;

    /**
     * Embedding row id -> epoch milliseconds until which the row is deferred.
     * @type {Map<number, number>}
     */
    this._deferredUntil = new Map();
  }

  /**
   * Start the periodic embedding worker loop.
   * Processes up to {@link BATCH_SIZE} NULL-vector rows every
   * {@link POLL_INTERVAL_MS} milliseconds. Calling start() on a running
   * worker is a no-op.
   */
  start() {
    if (this._running) {
      return;
    }

    this._running = true;
    this._stopRequested = false;
    this.logger.info('embedding-worker', 'Starting background embedding worker.');
    this._scheduleNext();
  }

  /**
   * Stop the worker loop. The pending timer is cancelled immediately; an
   * in-flight batch finishes the row it is currently working on and then
   * stops. This method does not wait for that row to complete.
   */
  stop() {
    this._running = false;
    this._stopRequested = true;

    if (this._timer !== null) {
      clearTimeout(this._timer);
      this._timer = null;
    }

    this.logger.info('embedding-worker', 'Embedding worker stopped.');
  }

  // ---------------------------------------------------------------------------
  // Internal
  // ---------------------------------------------------------------------------

  /**
   * Schedule the next processing iteration after POLL_INTERVAL_MS.
   *
   * At most one timer is ever pending. A stop() followed by start() while a
   * batch is still in flight would otherwise arm a second timer when that
   * batch finishes, leaving one timer that stop() can no longer cancel.
   */
  _scheduleNext() {
    if (!this._running || this._timer !== null) {
      return;
    }

    this._timer = setTimeout(async () => {
      this._timer = null;

      // The worker may have been stopped between arming and firing.
      if (!this._running) {
        return;
      }

      // Skip if the previous iteration is still running
      if (this._processing) {
        this._scheduleNext();
        return;
      }

      this._processing = true;
      try {
        await this._processQueue();
      } catch (err) {
        this.logger.error(
          'embedding-worker',
          `Unexpected error in worker loop: ${err.message}`,
        );
      } finally {
        this._processing = false;
        this._scheduleNext();
      }
    }, POLL_INTERVAL_MS);
  }

  /**
   * Mark an embedding row as not worth retrying until the retry delay passes.
   *
   * @param {number} id - Primary key of the embeddings row.
   */
  _defer(id) {
    this._deferredUntil.set(id, Date.now() + this._retryDelayMs);
  }

  /**
   * Drop expired deferrals and return the ids that are still deferred.
   *
   * @returns {number[]} Embedding row ids to exclude from the next selection.
   */
  _activeDeferredIds() {
    const now = Date.now();
    for (const [id, until] of this._deferredUntil) {
      if (until <= now) {
        this._deferredUntil.delete(id);
      }
    }
    return [...this._deferredUntil.keys()];
  }

  /**
   * Fetch and process a batch of rows with NULL vectors, oldest first,
   * excluding rows that are currently deferred. A failure on one row is
   * logged, defers that row, and does not stop the rest of the batch.
   *
   * @returns {Promise<void>}
   */
  async _processQueue() {
    const deferredIds = this._activeDeferredIds();

    // `id <> ALL(array)` is true for every row when the array is empty.
    const result = await this.db.query(
      `SELECT id, entity_type, entity_id
       FROM embeddings
       WHERE vector IS NULL
         AND id <> ALL($2::int[])
       ORDER BY created_at ASC
       LIMIT $1`,
      [BATCH_SIZE, deferredIds],
    );

    if (result.rows.length === 0) {
      return;
    }

    this.logger.debug(
      'embedding-worker',
      `Processing ${result.rows.length} pending embedding(s).`,
    );

    for (const row of result.rows) {
      // Honour stop() between rows instead of draining the whole batch.
      if (this._stopRequested) {
        break;
      }

      try {
        await this._processRow(row);
      } catch (err) {
        // Log the failure, back off on this row, and continue with the next
        this._defer(row.id);
        this.logger.error(
          'embedding-worker',
          `Failed to generate embedding for ${row.entity_type}:${row.entity_id}: ${err.message}`,
        );
      }
    }
  }

  /**
   * Process a single embedding row: look up the source text, call the
   * embedding API, and store the resulting vector.
   *
   * - Unknown entity type or empty source text: the row is kept but deferred.
   * - Source entity no longer exists: the embeddings row is deleted.
   *
   * @param {{ id: number, entity_type: string, entity_id: number }} row
   * @returns {Promise<void>}
   * @throws {Error} When a query fails or the embedding result is unusable.
   */
  async _processRow(row) {
    const { id, entity_type: entityType, entity_id: entityId } = row;

    // Resolve the query for this entity type. Own-property check only: a
    // type such as "constructor" must not match an inherited member.
    const isKnownType = Object.prototype.hasOwnProperty.call(ENTITY_TEXT_SOURCES, entityType);
    const source = isKnownType ? ENTITY_TEXT_SOURCES[entityType] : undefined;
    if (!source) {
      this._defer(id);
      this.logger.warn(
        'embedding-worker',
        `Unknown entity type "${entityType}" for embedding ${id}; skipping.`,
      );
      return;
    }

    // Fetch the text content from the source table
    const textResult = await this.db.query(source.query, [entityId]);

    if (textResult.rows.length === 0) {
      this.logger.warn(
        'embedding-worker',
        `Source entity ${entityType}:${entityId} not found; removing orphaned embedding row ${id}.`,
      );
      await this.db.query('DELETE FROM embeddings WHERE id = $1', [id]);
      return;
    }

    const text = textResult.rows[0].text;
    if (!text || text.trim().length === 0) {
      this._defer(id);
      this.logger.debug(
        'embedding-worker',
        `Empty text for ${entityType}:${entityId}; skipping embedding generation.`,
      );
      return;
    }

    // Generate the embedding vector via the configured API
    const generated = await generateEmbedding(text, this.config);

    // Reject unusable results here with a clear message instead of sending a
    // malformed literal such as "[]" or "[NaN]" to the database.
    const values = Array.from(generated?.vector ?? []);
    if (values.length === 0 || !values.every((value) => Number.isFinite(value))) {
      throw new Error('embedding provider returned an empty or non-numeric vector');
    }

    // Format as a pgvector literal: [0.123,0.456,...]
    const vectorLiteral = `[${values.join(',')}]`;

    // Update the row with the computed vector
    await this.db.query(
      `UPDATE embeddings
       SET vector = $1::vector, updated_at = NOW()
       WHERE id = $2`,
      [vectorLiteral, id],
    );

    this.logger.debug(
      'embedding-worker',
      `Generated embedding for ${entityType}:${entityId} (${values.length} dimensions).`,
    );
  }
}
219
+
220
+ export { EmbeddingWorker };
221
+ export default EmbeddingWorker;