@karmaniverous/jeeves-watcher 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +28 -0
- package/README.md +236 -0
- package/dist/cjs/index.js +1577 -0
- package/dist/cli/jeeves-watcher/index.js +1765 -0
- package/dist/index.d.ts +615 -0
- package/dist/index.iife.js +1562 -0
- package/dist/index.iife.min.js +1 -0
- package/dist/mjs/index.js +1537 -0
- package/package.json +169 -0
package/dist/index.iife.js
@@ -0,0 +1,1562 @@
(function (exports, Fastify, node_crypto, promises, node_path, picomatch, chokidar, Ajv, cosmiconfig, googleGenai, pino, textsplitters, cheerio, yaml, mammoth, uuid, addFormats, jsClientRest) {
    'use strict';

    function _interopNamespaceDefault(e) {
        var n = Object.create(null);
        if (e) {
            Object.keys(e).forEach(function (k) {
                if (k !== 'default') {
                    var d = Object.getOwnPropertyDescriptor(e, k);
                    Object.defineProperty(n, k, d.get ? d : {
                        enumerable: true,
                        get: function () { return e[k]; }
                    });
                }
            });
        }
        n.default = e;
        return Object.freeze(n);
    }

    var cheerio__namespace = /*#__PURE__*/_interopNamespaceDefault(cheerio);

    /**
     * Normalise a file path for deterministic mapping: lowercase, forward slashes, strip leading drive letter colon.
     *
     * @param filePath - The original file path.
     * @returns The normalised path string.
     */
    function normalisePath$1(filePath) {
        return filePath
            .replace(/\\/g, '/')
            .replace(/^([A-Za-z]):/, (_m, letter) => letter.toLowerCase())
            .toLowerCase();
    }
    /**
     * Derive a deterministic `.meta.json` path for a given file.
     *
     * @param filePath - The watched file path.
     * @param metadataDir - The root metadata directory.
     * @returns The full path to the metadata file.
     */
    function metadataPath(filePath, metadataDir) {
        const normalised = normalisePath$1(filePath);
        const hash = node_crypto.createHash('sha256').update(normalised, 'utf8').digest('hex');
        return node_path.join(metadataDir, `${hash}.meta.json`);
    }
    /**
     * Read persisted metadata for a file.
     *
     * @param filePath - The watched file path.
     * @param metadataDir - The root metadata directory.
     * @returns The parsed metadata object, or `null` if not found.
     */
    async function readMetadata(filePath, metadataDir) {
        try {
            const raw = await promises.readFile(metadataPath(filePath, metadataDir), 'utf8');
            return JSON.parse(raw);
        }
        catch {
            return null;
        }
    }
    /**
     * Write metadata for a file.
     *
     * @param filePath - The watched file path.
     * @param metadataDir - The root metadata directory.
     * @param metadata - The metadata to persist.
     */
    async function writeMetadata(filePath, metadataDir, metadata) {
        const dest = metadataPath(filePath, metadataDir);
        await promises.mkdir(node_path.dirname(dest), { recursive: true });
        await promises.writeFile(dest, JSON.stringify(metadata, null, 2), 'utf8');
    }
    /**
     * Delete metadata for a file.
     *
     * @param filePath - The watched file path.
     * @param metadataDir - The root metadata directory.
     */
    async function deleteMetadata(filePath, metadataDir) {
        try {
            await promises.rm(metadataPath(filePath, metadataDir));
        }
        catch {
            // Ignore if file doesn't exist.
        }
    }
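
    // --- Usage sketch (editor's illustration, not part of the published bundle) ---
    // Reading and writing enrichment metadata for a watched file. The file path
    // and metadata directory here are hypothetical.
    //
    //   const existing = await readMetadata('./docs/note.md', '.jeeves-watcher');
    //   await writeMetadata('./docs/note.md', '.jeeves-watcher', {
    //     ...(existing ?? {}),
    //     tags: ['draft'],
    //   });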

    /**
     * Best-effort base directory inference for a glob pattern.
     *
     * For our use (watch paths in config), this only needs to be good enough
     * to scan the directory tree in integration tests.
     */
    function globBase(pattern) {
        const normalised = pattern.replace(/\\/g, '/');
        // eslint-disable-next-line no-useless-escape
        const globIdx = normalised.search(/[*?\[]/);
        if (globIdx === -1)
            return node_path.resolve(pattern);
        const prefix = normalised.slice(0, globIdx);
        // If prefix ends mid-segment, dirname to get a real directory.
        const base = prefix.endsWith('/') ? prefix.slice(0, -1) : node_path.dirname(prefix);
        return node_path.resolve(base);
    }
    async function* walk(dir) {
        let entries;
        try {
            const dirents = await promises.readdir(dir, { withFileTypes: true });
            entries = dirents.map((d) => ({
                name: d.name,
                isDirectory: d.isDirectory(),
            }));
        }
        catch {
            return;
        }
        for (const entry of entries) {
            const full = node_path.resolve(dir, entry.name);
            if (entry.isDirectory) {
                yield* walk(full);
            }
            else {
                // Ensure it's a file.
                try {
                    const st = await promises.stat(full);
                    if (st.isFile())
                        yield full;
                }
                catch {
                    // ignore
                }
            }
        }
    }
    /**
     * List files matching a set of globs, with optional ignore globs.
     */
    async function listFilesFromGlobs(patterns, ignored = []) {
        const normPatterns = patterns.map((p) => p.replace(/\\/g, '/'));
        const normIgnored = ignored.map((p) => p.replace(/\\/g, '/'));
        const match = picomatch(normPatterns, { dot: true });
        const ignore = normIgnored.length
            ? picomatch(normIgnored, { dot: true })
            : () => false;
        const bases = Array.from(new Set(patterns.map(globBase)));
        const seen = new Set();
        for (const base of bases) {
            for await (const file of walk(base)) {
                const rel = file.replace(/\\/g, '/');
                if (ignore(rel))
                    continue;
                if (!match(rel))
                    continue;
                seen.add(file);
            }
        }
        return Array.from(seen);
    }
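
    // --- Usage sketch (editor's illustration, not part of the published bundle) ---
    // One-shot enumeration as used by the /reindex route below. Note that matched
    // paths are absolute, so absolute patterns are the safe choice; these are
    // hypothetical.
    //
    //   const files = await listFilesFromGlobs(
    //     ['/data/docs/**/*.md'],
    //     ['**/node_modules/**'],
    //   );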

    /**
     * Create the Fastify API server with all routes registered.
     *
     * The returned instance is not yet listening — call `server.listen()` to start.
     *
     * @param options - The server options.
     * @returns A configured Fastify instance.
     */
    function createApiServer(options) {
        const { processor, vectorStore, embeddingProvider, logger } = options;
        const app = Fastify({ logger: false });
        app.get('/status', () => ({
            status: 'ok',
            uptime: process.uptime(),
        }));
        app.post('/metadata', async (request, reply) => {
            try {
                const { path, metadata } = request.body;
                await processor.processMetadataUpdate(path, metadata);
                return { ok: true };
            }
            catch (error) {
                logger.error({ error }, 'Metadata update failed');
                return reply.status(500).send({ error: 'Internal server error' });
            }
        });
        app.post('/search', async (request, reply) => {
            try {
                const { query, limit = 10 } = request.body;
                const vectors = await embeddingProvider.embed([query]);
                const results = await vectorStore.search(vectors[0], limit);
                return results;
            }
            catch (error) {
                logger.error({ error }, 'Search failed');
                return reply.status(500).send({ error: 'Internal server error' });
            }
        });
        app.post('/reindex', async (_request, reply) => {
            try {
                const files = await listFilesFromGlobs(options.config.watch.paths, options.config.watch.ignored);
                for (const file of files) {
                    // Sequential on purpose to avoid surprising load.
                    // Queue integration can come later.
                    await processor.processFile(file);
                }
                return await reply
                    .status(200)
                    .send({ ok: true, filesIndexed: files.length });
            }
            catch (error) {
                logger.error({ error }, 'Reindex failed');
                return await reply.status(500).send({ error: 'Internal server error' });
            }
        });
        app.post('/rebuild-metadata', async (_request, reply) => {
            try {
                const metadataDir = options.config.metadataDir ?? '.jeeves-watcher';
                for await (const point of vectorStore.scroll()) {
                    const payload = point.payload;
                    const filePath = payload['file_path'];
                    if (typeof filePath !== 'string' || filePath.length === 0)
                        continue;
                    // Persist only enrichment-ish fields, not chunking/index fields.
                    const rest = { ...payload };
                    delete rest.file_path;
                    delete rest.chunk_index;
                    delete rest.total_chunks;
                    delete rest.content_hash;
                    delete rest.chunk_text;
                    await writeMetadata(filePath, metadataDir, rest);
                }
                return await reply.status(200).send({ ok: true });
            }
            catch (error) {
                logger.error({ error }, 'Rebuild metadata failed');
                return await reply.status(500).send({ error: 'Internal server error' });
            }
        });
        app.post('/config-reindex', async (request, reply) => {
            try {
                const scope = request.body.scope ?? 'rules';
                // Return immediately and run async.
                void (async () => {
                    try {
                        if (scope === 'rules') {
                            // Re-apply inference rules to all files, update Qdrant payloads (no re-embedding).
                            const files = await listFilesFromGlobs(options.config.watch.paths, options.config.watch.ignored);
                            for (const file of files) {
                                await processor.processRulesUpdate(file);
                            }
                            logger.info({ scope, filesProcessed: files.length }, 'Config reindex (rules) completed');
                        }
                        else {
                            // Full reindex: re-extract, re-embed, re-upsert.
                            const files = await listFilesFromGlobs(options.config.watch.paths, options.config.watch.ignored);
                            for (const file of files) {
                                await processor.processFile(file);
                            }
                            logger.info({ scope, filesProcessed: files.length }, 'Config reindex (full) completed');
                        }
                    }
                    catch (error) {
                        logger.error({ error, scope }, 'Config reindex failed');
                    }
                })();
                return await reply.status(200).send({ status: 'started', scope });
            }
            catch (error) {
                logger.error({ error }, 'Config reindex request failed');
                return await reply.status(500).send({ error: 'Internal server error' });
            }
        });
        return app;
    }
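
    // --- Usage sketch (editor's illustration, not part of the published bundle) ---
    // Querying the running API with Node's built-in fetch; host and port assume
    // the defaults defined below (127.0.0.1:3100).
    //
    //   const res = await fetch('http://127.0.0.1:3100/search', {
    //     method: 'POST',
    //     headers: { 'content-type': 'application/json' },
    //     body: JSON.stringify({ query: 'quarterly report', limit: 5 }),
    //   });
    //   const hits = await res.json(); // [{ id, score, payload }, ...]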

    const MODULE_NAME = 'jeeves-watcher';
    /** JSON Schema for validating jeeves-watcher configuration. */
    const configSchema = {
        type: 'object',
        required: ['watch', 'embedding', 'vectorStore'],
        properties: {
            watch: {
                type: 'object',
                required: ['paths'],
                properties: {
                    paths: { type: 'array', items: { type: 'string' }, minItems: 1 },
                    ignored: { type: 'array', items: { type: 'string' } },
                    pollIntervalMs: { type: 'number' },
                    usePolling: { type: 'boolean' },
                    debounceMs: { type: 'number' },
                    stabilityThresholdMs: { type: 'number' },
                },
                additionalProperties: false,
            },
            configWatch: {
                type: 'object',
                properties: {
                    enabled: { type: 'boolean' },
                    debounceMs: { type: 'number' },
                },
                additionalProperties: false,
            },
            embedding: {
                type: 'object',
                required: ['provider', 'model'],
                properties: {
                    provider: { type: 'string' },
                    model: { type: 'string' },
                    chunkSize: { type: 'number' },
                    chunkOverlap: { type: 'number' },
                    dimensions: { type: 'number' },
                    apiKey: { type: 'string' },
                    rateLimitPerMinute: { type: 'number' },
                    concurrency: { type: 'number' },
                },
                additionalProperties: false,
            },
            vectorStore: {
                type: 'object',
                required: ['url', 'collectionName'],
                properties: {
                    url: { type: 'string' },
                    collectionName: { type: 'string' },
                    apiKey: { type: 'string' },
                },
                additionalProperties: false,
            },
            metadataDir: { type: 'string' },
            api: {
                type: 'object',
                properties: {
                    host: { type: 'string' },
                    port: { type: 'number' },
                },
                additionalProperties: false,
            },
            extractors: { type: 'object' },
            inferenceRules: {
                type: 'array',
                items: {
                    type: 'object',
                    required: ['match', 'set'],
                    properties: {
                        match: { type: 'object' },
                        set: { type: 'object' },
                    },
                    additionalProperties: false,
                },
            },
            logging: {
                type: 'object',
                properties: {
                    level: { type: 'string' },
                    file: { type: 'string' },
                },
                additionalProperties: false,
            },
            shutdownTimeoutMs: { type: 'number' },
        },
        additionalProperties: false,
    };
    const ajv = new Ajv({ allErrors: true });
    const validate = ajv.compile(configSchema);
    /** Default values for optional configuration fields. */
    const DEFAULTS = {
        configWatch: { enabled: true, debounceMs: 1000 },
        metadataDir: '.jeeves-watcher',
        api: { host: '127.0.0.1', port: 3100 },
        logging: { level: 'info' },
        shutdownTimeoutMs: 10000,
    };
    /** Default values for watch configuration. */
    const WATCH_DEFAULTS = {
        debounceMs: 300,
        stabilityThresholdMs: 500,
        usePolling: false,
        pollIntervalMs: 1000,
    };
    /** Default values for embedding configuration. */
    const EMBEDDING_DEFAULTS = {
        chunkSize: 1000,
        chunkOverlap: 200,
        dimensions: 768,
        rateLimitPerMinute: 300,
        concurrency: 5,
    };
    /**
     * Merge sensible defaults into a loaded configuration.
     *
     * @param raw - The raw loaded configuration.
     * @returns The configuration with defaults applied.
     */
    function applyDefaults(raw) {
        return {
            ...DEFAULTS,
            ...raw,
            watch: { ...WATCH_DEFAULTS, ...raw.watch },
            configWatch: { ...DEFAULTS.configWatch, ...raw.configWatch },
            embedding: { ...EMBEDDING_DEFAULTS, ...raw.embedding },
            api: { ...DEFAULTS.api, ...raw.api },
            logging: { ...DEFAULTS.logging, ...raw.logging },
        };
    }
    /**
     * Load the jeeves-watcher configuration.
     *
     * @param configPath - Optional explicit path to a config file.
     * @returns The loaded configuration.
     * @throws If no configuration is found or validation fails.
     */
    async function loadConfig(configPath) {
        const explorer = cosmiconfig.cosmiconfig(MODULE_NAME);
        const result = configPath
            ? await explorer.load(configPath)
            : await explorer.search();
        if (!result || result.isEmpty) {
            throw new Error('No jeeves-watcher configuration found. Create a .jeeves-watcherrc or jeeves-watcher.config.{js,ts,json,yaml} file.');
        }
        const raw = result.config;
        if (!validate(raw)) {
            const errors = validate.errors
                ?.map((e) => {
                    const instancePath = 'instancePath' in e
                        ? e.instancePath
                        : undefined;
                    return `${instancePath ?? '/'}: ${e.message ?? 'unknown error'}`;
                })
                .join('; ');
            throw new Error(`Invalid jeeves-watcher configuration: ${errors ?? 'unknown error'}`);
        }
        return applyDefaults(raw);
    }
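
    // --- Config sketch (editor's illustration, not part of the published bundle) ---
    // A minimal .jeeves-watcherrc (JSON) satisfying the schema above; paths,
    // collection name, and model are hypothetical.
    //
    //   {
    //     "watch": { "paths": ["/data/docs/**/*.md"] },
    //     "embedding": { "provider": "mock", "model": "none", "dimensions": 768 },
    //     "vectorStore": { "url": "http://localhost:6333", "collectionName": "jeeves" }
    //   }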

    /**
     * Create a mock embedding provider that generates deterministic vectors from content hashes.
     *
     * @param dimensions - The number of dimensions for the output vectors.
     * @returns A mock {@link EmbeddingProvider}.
     */
    function createMockProvider(dimensions) {
        return {
            dimensions,
            embed(texts) {
                return Promise.resolve(texts.map((text) => {
                    const hash = node_crypto.createHash('sha256').update(text, 'utf8').digest();
                    const vector = [];
                    for (let i = 0; i < dimensions; i++) {
                        // Use bytes cyclically from the hash to generate deterministic floats in [-1, 1].
                        const byte = hash[i % hash.length];
                        vector.push(byte / 127.5 - 1);
                    }
                    return vector;
                }));
            },
        };
    }
    /**
     * Create a Gemini embedding provider using the Google Generative AI SDK.
     *
     * @param config - The embedding configuration.
     * @returns A Gemini {@link EmbeddingProvider}.
     * @throws If the API key is missing.
     */
    function createGeminiProvider(config) {
        if (!config.apiKey) {
            throw new Error('Gemini embedding provider requires config.embedding.apiKey');
        }
        const dimensions = config.dimensions ?? 3072;
        const embedder = new googleGenai.GoogleGenerativeAIEmbeddings({
            apiKey: config.apiKey,
            model: config.model,
        });
        return {
            dimensions,
            async embed(texts) {
                // embedDocuments returns vectors for multiple texts.
                const vectors = await embedder.embedDocuments(texts);
                // Validate dimensions.
                for (const vector of vectors) {
                    if (vector.length !== dimensions) {
                        throw new Error(`Gemini embedding returned invalid dimensions: expected ${String(dimensions)}, got ${String(vector.length)}`);
                    }
                }
                return vectors;
            },
        };
    }
    /**
     * Create an embedding provider based on the given configuration.
     *
     * @param config - The embedding configuration.
     * @returns An {@link EmbeddingProvider} instance.
     * @throws If the configured provider is not supported.
     */
    function createEmbeddingProvider(config) {
        const dimensions = config.dimensions ?? 768;
        switch (config.provider) {
            case 'mock':
                return createMockProvider(dimensions);
            case 'gemini':
                return createGeminiProvider(config);
            default:
                throw new Error(`Unsupported embedding provider: ${config.provider}`);
        }
    }
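
    // --- Usage sketch (editor's illustration, not part of the published bundle) ---
    // The mock provider is deterministic: identical text always yields an
    // identical vector, which is what makes the content-hash skip logic testable.
    //
    //   const provider = createEmbeddingProvider({ provider: 'mock', model: 'none', dimensions: 8 });
    //   const [a, b] = await provider.embed(['hello', 'hello']);
    //   // a and b are element-for-element equal.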

    /**
     * Create a pino logger instance.
     *
     * @param config - Optional logging configuration.
     * @returns A configured pino logger.
     */
    function createLogger(config) {
        const level = config?.level ?? 'info';
        if (config?.file) {
            const transport = pino.transport({
                target: 'pino/file',
                options: { destination: config.file, mkdir: true },
            });
            return pino({ level }, transport);
        }
        return pino({ level });
    }

    /**
     * Extract YAML frontmatter from a Markdown document.
     *
     * @param markdown - The raw markdown content.
     * @returns The extracted frontmatter (if any) and body.
     */
    function extractMarkdownFrontmatter(markdown) {
        const trimmed = markdown.replace(/^\uFEFF/, '');
        const match = /^---\s*\n([\s\S]*?)\n---\s*\n?([\s\S]*)$/m.exec(trimmed);
        if (!match)
            return { body: markdown };
        const [, rawYaml, body] = match;
        const parsed = yaml.load(rawYaml);
        const frontmatter = parsed && typeof parsed === 'object' && !Array.isArray(parsed)
            ? parsed
            : undefined;
        return { frontmatter, body };
    }
    const JSON_TEXT_FIELDS = [
        'content',
        'body',
        'text',
        'snippet',
        'subject',
        'description',
        'summary',
        'transcript',
    ];
    /**
     * Extract meaningful text from a parsed JSON object.
     *
     * @param obj - Parsed JSON content.
     * @returns A text representation for embedding.
     */
    function extractJsonText(obj) {
        if (!obj || typeof obj !== 'object')
            return JSON.stringify(obj);
        const rec = obj;
        for (const field of JSON_TEXT_FIELDS) {
            const value = rec[field];
            if (typeof value === 'string' && value.trim())
                return value;
        }
        return JSON.stringify(obj);
    }
    /**
     * Extract text from a file based on extension.
     *
     * @param filePath - Path to the file.
     * @param extension - File extension (including leading dot).
     * @returns Extracted text and optional structured data.
     */
    async function extractText(filePath, extension) {
        const ext = extension.toLowerCase();
        if (ext === '.md' || ext === '.markdown') {
            const raw = await promises.readFile(filePath, 'utf8');
            const { frontmatter, body } = extractMarkdownFrontmatter(raw);
            return { text: body, frontmatter };
        }
        if (ext === '.txt' || ext === '.text') {
            const raw = await promises.readFile(filePath, 'utf8');
            return { text: raw };
        }
        if (ext === '.json') {
            const raw = await promises.readFile(filePath, 'utf8');
            const parsed = JSON.parse(raw);
            const json = parsed && typeof parsed === 'object' && !Array.isArray(parsed)
                ? parsed
                : undefined;
            return { text: extractJsonText(parsed), json };
        }
        if (ext === '.pdf') {
            const buffer = await promises.readFile(filePath);
            const uint8Array = new Uint8Array(buffer);
            const { extractText: extractPdfText } = await import('unpdf');
            const { text } = await extractPdfText(uint8Array);
            // unpdf returns an array of strings (one per page).
            const content = Array.isArray(text) ? text.join('\n\n') : text;
            return { text: content };
        }
        if (ext === '.docx') {
            const buffer = await promises.readFile(filePath);
            const result = await mammoth.extractRawText({ buffer });
            return { text: result.value };
        }
        if (ext === '.html' || ext === '.htm') {
            const raw = await promises.readFile(filePath, 'utf8');
            const $ = cheerio__namespace.load(raw);
            // Remove script and style elements.
            $('script, style').remove();
            // Extract text content.
            const text = $('body').text().trim() || $.text().trim();
            return { text };
        }
        // Default: treat as plaintext.
        const raw = await promises.readFile(filePath, 'utf8');
        return { text: raw };
    }
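
    // --- Usage sketch (editor's illustration, not part of the published bundle) ---
    // Frontmatter is split out of Markdown before embedding; the file path is
    // hypothetical.
    //
    //   const { text, frontmatter } = await extractText('./docs/note.md', '.md');
    //   // frontmatter: parsed YAML object (or undefined); text: the body only.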

    /**
     * Compute a SHA-256 hex digest of the given text.
     *
     * @param text - The input text to hash.
     * @returns The hex-encoded SHA-256 hash.
     */
    function contentHash(text) {
        return node_crypto.createHash('sha256').update(text, 'utf8').digest('hex');
    }

    /** Namespace UUID for jeeves-watcher point IDs. */
    const NAMESPACE = '6a6f686e-6761-4c74-ad6a-656576657321';
    /**
     * Normalise a file path for deterministic point ID generation.
     *
     * @param filePath - The original file path.
     * @returns The normalised path string.
     */
    function normalisePath(filePath) {
        return filePath.replace(/\\/g, '/').toLowerCase();
    }
    /**
     * Generate a deterministic UUID v5 point ID for a file (and optional chunk index).
     *
     * @param filePath - The file path.
     * @param chunkIndex - Optional chunk index within the file.
     * @returns A deterministic UUID v5 string.
     */
    function pointId(filePath, chunkIndex) {
        const key = chunkIndex !== undefined
            ? `${normalisePath(filePath)}#${String(chunkIndex)}`
            : normalisePath(filePath);
        return uuid.v5(key, NAMESPACE);
    }
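
    // --- Usage sketch (editor's illustration, not part of the published bundle) ---
    // Point IDs are stable across runs, casing, and path separators, so
    // re-upserting a file overwrites its existing chunks instead of duplicating them.
    //
    //   pointId('C:\\docs\\Note.md', 0) === pointId('c:/docs/note.md', 0); // true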

    /**
     * Build {@link FileAttributes} from a file path and stat info.
     *
     * @param filePath - The file path.
     * @param stats - The file stats.
     * @param extractedFrontmatter - Optional extracted frontmatter.
     * @param extractedJson - Optional parsed JSON content.
     * @returns The constructed file attributes.
     */
    function buildAttributes(filePath, stats, extractedFrontmatter, extractedJson) {
        const normalised = filePath.replace(/\\/g, '/');
        const attrs = {
            file: {
                path: normalised,
                directory: node_path.dirname(normalised).replace(/\\/g, '/'),
                filename: node_path.basename(normalised),
                extension: node_path.extname(normalised),
                sizeBytes: stats.size,
                modified: stats.mtime.toISOString(),
            },
        };
        if (extractedFrontmatter)
            attrs.frontmatter = extractedFrontmatter;
        if (extractedJson)
            attrs.json = extractedJson;
        return attrs;
    }
    /**
     * Create an ajv instance with a custom `glob` keyword for picomatch glob matching.
     *
     * @returns The configured ajv instance.
     */
    function createRuleAjv() {
        const ajv = new Ajv({ allErrors: true });
        addFormats(ajv);
        ajv.addKeyword({
            keyword: 'glob',
            type: 'string',
            schemaType: 'string',
            validate: (pattern, data) => picomatch.isMatch(data, pattern),
        });
        return ajv;
    }
    /**
     * Compile an array of inference rules into executable validators.
     *
     * @param rules - The inference rule definitions.
     * @returns An array of compiled rules.
     */
    function compileRules(rules) {
        const ajv = createRuleAjv();
        return rules.map((rule, idx) => ({
            rule,
            validate: ajv.compile({
                $id: `rule-${String(idx)}`,
                ...rule.match,
            }),
        }));
    }
    /**
     * Resolve `$\{template.vars\}` in a value against the given attributes.
     *
     * @param value - The value to resolve.
     * @param attributes - The file attributes for variable lookup.
     * @returns The resolved value.
     */
    function resolveTemplateVars(value, attributes) {
        if (typeof value !== 'string')
            return value;
        return value.replace(/\$\{([^}]+)\}/g, (_match, varPath) => {
            const parts = varPath.split('.');
            let current = attributes;
            for (const part of parts) {
                if (current === null || current === undefined)
                    return '';
                current = current[part];
            }
            if (current === null || current === undefined)
                return '';
            return typeof current === 'string' ? current : JSON.stringify(current);
        });
    }
    /**
     * Resolve all template variables in a `set` object.
     *
     * @param setObj - The key-value pairs to resolve.
     * @param attributes - The file attributes for variable lookup.
     * @returns The resolved key-value pairs.
     */
    function resolveSet(setObj, attributes) {
        const result = {};
        for (const [key, value] of Object.entries(setObj)) {
            result[key] = resolveTemplateVars(value, attributes);
        }
        return result;
    }
    /**
     * Apply compiled inference rules to file attributes, returning merged metadata.
     *
     * Rules are evaluated in order; later rules override earlier ones.
     *
     * @param compiledRules - The compiled rules to evaluate.
     * @param attributes - The file attributes to match against.
     * @returns The merged metadata from all matching rules.
     */
    function applyRules(compiledRules, attributes) {
        let merged = {};
        for (const { rule, validate } of compiledRules) {
            if (validate(attributes)) {
                merged = { ...merged, ...resolveSet(rule.set, attributes) };
            }
        }
        return merged;
    }
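
    // --- Rule sketch (editor's illustration, not part of the published bundle) ---
    // A hypothetical inference rule: `match` is a JSON Schema (with the custom
    // `glob` keyword above) validated against the file attributes, and `set`
    // copies the frontmatter title into the payload via a template variable.
    //
    //   const rules = compileRules([{
    //     match: { properties: { file: { properties: { path: { glob: '**/docs/meetings/**' } } } } },
    //     set: { category: 'meeting', title: '${frontmatter.title}' },
    //   }]);
    //   const meta = applyRules(rules, attributes); // e.g. { category: 'meeting', title: '...' }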

    /**
     * Core document processing pipeline.
     *
     * Handles extracting text, computing embeddings, and syncing with the vector store.
     */
    class DocumentProcessor {
        config;
        embeddingProvider;
        vectorStore;
        compiledRules;
        logger;
        metadataDir;
        /**
         * Create a new DocumentProcessor.
         *
         * @param config - The application configuration.
         * @param embeddingProvider - The embedding provider.
         * @param vectorStore - The vector store client.
         * @param compiledRules - The compiled inference rules.
         * @param logger - The logger instance.
         */
        constructor(config, embeddingProvider, vectorStore, compiledRules, logger) {
            this.config = config;
            this.embeddingProvider = embeddingProvider;
            this.vectorStore = vectorStore;
            this.compiledRules = compiledRules;
            this.logger = logger;
            this.metadataDir = config.metadataDir ?? '.jeeves-watcher';
        }
        /**
         * Process a file through the full pipeline: extract, hash, chunk, embed, upsert.
         *
         * @param filePath - The file to process.
         */
        async processFile(filePath) {
            try {
                const ext = node_path.extname(filePath);
                const stats = await promises.stat(filePath);
                // 1. Extract text.
                const extracted = await extractText(filePath, ext);
                if (!extracted.text.trim()) {
                    this.logger.debug({ filePath }, 'Skipping empty file');
                    return;
                }
                // 2. Content hash check — skip if unchanged.
                const hash = contentHash(extracted.text);
                const baseId = pointId(filePath, 0);
                const existingPayload = await this.vectorStore.getPayload(baseId);
                if (existingPayload && existingPayload['content_hash'] === hash) {
                    this.logger.debug({ filePath }, 'Content unchanged, skipping');
                    return;
                }
                const oldTotalChunks = typeof existingPayload?.['total_chunks'] === 'number'
                    ? existingPayload['total_chunks']
                    : 0;
                // 3. Build attributes + apply rules → inferred metadata.
                const attributes = buildAttributes(filePath, stats, extracted.frontmatter, extracted.json);
                const inferred = applyRules(this.compiledRules, attributes);
                // 4. Read enrichment metadata (merge, enrichment wins).
                const enrichment = await readMetadata(filePath, this.metadataDir);
                const metadata = {
                    ...inferred,
                    ...(enrichment ?? {}),
                };
                // 5. Chunk text.
                const chunkSize = this.config.embedding.chunkSize ?? 1000;
                const chunkOverlap = this.config.embedding.chunkOverlap ?? 200;
                const splitter = this.createSplitter(ext, chunkSize, chunkOverlap);
                const chunks = await splitter.splitText(extracted.text);
                // 6. Embed all chunks.
                const vectors = await this.embeddingProvider.embed(chunks);
                // 7. Upsert all chunk points.
                const points = chunks.map((chunk, i) => ({
                    id: pointId(filePath, i),
                    vector: vectors[i],
                    payload: {
                        ...metadata,
                        file_path: filePath.replace(/\\/g, '/'),
                        chunk_index: i,
                        total_chunks: chunks.length,
                        content_hash: hash,
                        chunk_text: chunk,
                    },
                }));
                await this.vectorStore.upsert(points);
                // 8. Clean up orphaned chunks.
                if (oldTotalChunks > chunks.length) {
                    const orphanIds = [];
                    for (let i = chunks.length; i < oldTotalChunks; i++) {
                        orphanIds.push(pointId(filePath, i));
                    }
                    await this.vectorStore.delete(orphanIds);
                }
                this.logger.info({ filePath, chunks: chunks.length }, 'File processed successfully');
            }
            catch (error) {
                this.logger.error({ filePath, error }, 'Failed to process file');
            }
        }
        /**
         * Delete all chunks for a file from the vector store and remove metadata.
         *
         * @param filePath - The file to delete.
         */
        async deleteFile(filePath) {
            try {
                // Get the existing payload to find total chunks.
                const baseId = pointId(filePath, 0);
                const existingPayload = await this.vectorStore.getPayload(baseId);
                const totalChunks = typeof existingPayload?.['total_chunks'] === 'number'
                    ? existingPayload['total_chunks']
                    : 1;
                const ids = [];
                for (let i = 0; i < totalChunks; i++) {
                    ids.push(pointId(filePath, i));
                }
                await this.vectorStore.delete(ids);
                await deleteMetadata(filePath, this.metadataDir);
                this.logger.info({ filePath }, 'File deleted from index');
            }
            catch (error) {
                this.logger.error({ filePath, error }, 'Failed to delete file');
            }
        }
        /**
         * Process a metadata update: merge metadata, write to disk, update Qdrant payloads (no re-embed).
         *
         * @param filePath - The file whose metadata to update.
         * @param metadata - The new metadata to merge.
         * @returns The merged payload, or `null` if the file is not indexed.
         */
        async processMetadataUpdate(filePath, metadata) {
            try {
                // Read existing enrichment metadata and merge.
                const existing = (await readMetadata(filePath, this.metadataDir)) ?? {};
                const merged = { ...existing, ...metadata };
                await writeMetadata(filePath, this.metadataDir, merged);
                // Update all chunk payloads in Qdrant.
                const baseId = pointId(filePath, 0);
                const existingPayload = await this.vectorStore.getPayload(baseId);
                if (!existingPayload)
                    return null;
                const totalChunks = typeof existingPayload['total_chunks'] === 'number'
                    ? existingPayload['total_chunks']
                    : 1;
                const ids = [];
                for (let i = 0; i < totalChunks; i++) {
                    ids.push(pointId(filePath, i));
                }
                await this.vectorStore.setPayload(ids, merged);
                this.logger.info({ filePath, chunks: totalChunks }, 'Metadata updated');
                return merged;
            }
            catch (error) {
                this.logger.error({ filePath, error }, 'Failed to update metadata');
                return null;
            }
        }
        /**
         * Re-apply inference rules to a file without re-embedding.
         * Reads file attributes, applies current rules, merges with enrichment metadata,
         * and updates Qdrant payloads.
         *
         * @param filePath - The file to update.
         * @returns The merged metadata, or `null` if the file is not indexed.
         */
        async processRulesUpdate(filePath) {
            try {
                const baseId = pointId(filePath, 0);
                const existingPayload = await this.vectorStore.getPayload(baseId);
                if (!existingPayload) {
                    this.logger.debug({ filePath }, 'File not indexed, skipping');
                    return null;
                }
                const ext = node_path.extname(filePath);
                const stats = await promises.stat(filePath);
                // Extract frontmatter/json for attribute building (lightweight).
                const extracted = await extractText(filePath, ext);
                // Build attributes + apply current rules.
                const attributes = buildAttributes(filePath, stats, extracted.frontmatter, extracted.json);
                const inferred = applyRules(this.compiledRules, attributes);
                // Read enrichment metadata (merge, enrichment wins).
                const enrichment = await readMetadata(filePath, this.metadataDir);
                const metadata = {
                    ...inferred,
                    ...(enrichment ?? {}),
                };
                // Update all chunk payloads.
                const totalChunks = typeof existingPayload['total_chunks'] === 'number'
                    ? existingPayload['total_chunks']
                    : 1;
                const ids = [];
                for (let i = 0; i < totalChunks; i++) {
                    ids.push(pointId(filePath, i));
                }
                await this.vectorStore.setPayload(ids, metadata);
                this.logger.info({ filePath, chunks: totalChunks }, 'Rules re-applied');
                return metadata;
            }
            catch (error) {
                this.logger.error({ filePath, error }, 'Failed to re-apply rules');
                return null;
            }
        }
        /**
         * Update compiled inference rules for subsequent file processing.
         *
         * @param compiledRules - The newly compiled rules.
         */
        updateRules(compiledRules) {
            this.compiledRules = compiledRules;
            this.logger.info({ rules: compiledRules.length }, 'Inference rules updated');
        }
        /**
         * Create the appropriate text splitter for the given file extension.
         *
         * @param ext - File extension.
         * @param chunkSize - Maximum chunk size in characters.
         * @param chunkOverlap - Overlap between chunks in characters.
         * @returns A text splitter instance.
         */
        createSplitter(ext, chunkSize, chunkOverlap) {
            const lowerExt = ext.toLowerCase();
            if (lowerExt === '.md' || lowerExt === '.markdown') {
                return new textsplitters.MarkdownTextSplitter({ chunkSize, chunkOverlap });
            }
            return new textsplitters.RecursiveCharacterTextSplitter({ chunkSize, chunkOverlap });
        }
    }

    /**
     * A debounced, rate-limited, concurrent event queue.
     */
    class EventQueue {
        debounceMs;
        concurrency;
        rateLimitPerMinute;
        started = false;
        active = 0;
        debounceTimers = new Map();
        latestByKey = new Map();
        normalQueue = [];
        lowQueue = [];
        tokens;
        lastRefillMs = Date.now();
        drainWaiters = [];
        /**
         * Create an event queue.
         *
         * @param options - Queue options.
         */
        constructor(options) {
            this.debounceMs = options.debounceMs;
            this.concurrency = options.concurrency;
            this.rateLimitPerMinute = options.rateLimitPerMinute;
            this.tokens = this.rateLimitPerMinute ?? Number.POSITIVE_INFINITY;
        }
        /**
         * Enqueue an event, debounced per path+priority.
         *
         * @param event - The watch event.
         * @param fn - The processing function to invoke when dequeued.
         */
        enqueue(event, fn) {
            const key = `${event.priority}:${event.path}`;
            this.latestByKey.set(key, { event, fn });
            const existing = this.debounceTimers.get(key);
            if (existing)
                clearTimeout(existing);
            const timer = setTimeout(() => {
                this.debounceTimers.delete(key);
                const item = this.latestByKey.get(key);
                if (!item)
                    return;
                this.latestByKey.delete(key);
                this.push(item);
                this.pump();
            }, this.debounceMs);
            this.debounceTimers.set(key, timer);
        }
        /**
         * Start processing events.
         */
        process() {
            this.started = true;
            this.pump();
        }
        /**
         * Wait for the queue to become idle (no pending debounces, no queued items, no active work).
         *
         * @returns A promise that resolves when the queue is drained.
         */
        async drain() {
            if (this.isIdle())
                return;
            await new Promise((resolve) => {
                this.drainWaiters.push(resolve);
            });
        }
        push(item) {
            if (item.event.priority === 'low')
                this.lowQueue.push(item);
            else
                this.normalQueue.push(item);
        }
        refillTokens(nowMs) {
            if (this.rateLimitPerMinute === undefined)
                return;
            const elapsed = Math.max(0, nowMs - this.lastRefillMs);
            const refillRatePerMs = this.rateLimitPerMinute / 60000;
            const refill = elapsed * refillRatePerMs;
            this.tokens = Math.min(this.rateLimitPerMinute, this.tokens + refill);
            this.lastRefillMs = nowMs;
        }
        takeToken() {
            const now = Date.now();
            this.refillTokens(now);
            if (this.tokens < 1)
                return false;
            this.tokens -= 1;
            return true;
        }
        nextItem() {
            return this.normalQueue.shift() ?? this.lowQueue.shift();
        }
        pump() {
            if (!this.started)
                return;
            while (this.active < this.concurrency) {
                const item = this.nextItem();
                if (!item)
                    break;
                if (!this.takeToken()) {
                    // Put it back at the front of its queue and try later.
                    if (item.event.priority === 'low')
                        this.lowQueue.unshift(item);
                    else
                        this.normalQueue.unshift(item);
                    setTimeout(() => {
                        this.pump();
                    }, 250);
                    break;
                }
                this.active += 1;
                void Promise.resolve()
                    .then(() => item.fn(item.event))
                    .finally(() => {
                        this.active -= 1;
                        this.pump();
                        this.maybeResolveDrain();
                    });
            }
            this.maybeResolveDrain();
        }
        isIdle() {
            return (this.active === 0 &&
                this.normalQueue.length === 0 &&
                this.lowQueue.length === 0 &&
                this.debounceTimers.size === 0 &&
                this.latestByKey.size === 0);
        }
        maybeResolveDrain() {
            if (!this.isIdle())
                return;
            const waiters = this.drainWaiters;
            this.drainWaiters = [];
            for (const resolve of waiters)
                resolve();
        }
    }
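
    // --- Usage sketch (editor's illustration, not part of the published bundle) ---
    // Rapid saves of the same file collapse into one invocation: only the last
    // event enqueued for a key within the debounce window is processed.
    //
    //   const q = new EventQueue({ debounceMs: 300, concurrency: 2, rateLimitPerMinute: 60 });
    //   q.process();
    //   q.enqueue({ type: 'modify', path: 'a.md', priority: 'normal' }, async () => { /* superseded */ });
    //   q.enqueue({ type: 'modify', path: 'a.md', priority: 'normal' }, async () => { /* runs once */ });
    //   await q.drain();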

    /**
     * Client wrapper for Qdrant vector store operations.
     */
    class VectorStoreClient {
        client;
        collectionName;
        dims;
        /**
         * Create a new VectorStoreClient.
         *
         * @param config - Vector store configuration.
         * @param dimensions - The embedding vector dimensions.
         */
        constructor(config, dimensions) {
            this.client = new jsClientRest.QdrantClient({
                url: config.url,
                apiKey: config.apiKey,
            });
            this.collectionName = config.collectionName;
            this.dims = dimensions;
        }
        /**
         * Ensure the collection exists with correct dimensions and Cosine distance.
         */
        async ensureCollection() {
            try {
                const collections = await this.client.getCollections();
                const exists = collections.collections.some((c) => c.name === this.collectionName);
                if (!exists) {
                    await this.client.createCollection(this.collectionName, {
                        vectors: { size: this.dims, distance: 'Cosine' },
                    });
                }
            }
            catch (error) {
                throw new Error(`Failed to ensure collection "${this.collectionName}": ${String(error)}`);
            }
        }
        /**
         * Upsert points into the collection.
         *
         * @param points - The points to upsert.
         */
        async upsert(points) {
            if (points.length === 0)
                return;
            await this.client.upsert(this.collectionName, {
                wait: true,
                points: points.map((p) => ({
                    id: p.id,
                    vector: p.vector,
                    payload: p.payload,
                })),
            });
        }
        /**
         * Delete points by their IDs.
         *
         * @param ids - The point IDs to delete.
         */
        async delete(ids) {
            if (ids.length === 0)
                return;
            await this.client.delete(this.collectionName, {
                wait: true,
                points: ids,
            });
        }
        /**
         * Set payload fields for the specified point IDs.
         *
         * This merges the given payload object into each point's existing payload.
         *
         * @param ids - Point IDs to update.
         * @param payload - Payload fields to set.
         */
        async setPayload(ids, payload) {
            if (ids.length === 0)
                return;
            await this.client.setPayload(this.collectionName, {
                wait: true,
                points: ids,
                payload,
            });
        }
        /**
         * Get the payload of a point by ID.
         *
         * @param id - The point ID.
         * @returns The payload, or `null` if the point doesn't exist.
         */
        async getPayload(id) {
            try {
                const results = await this.client.retrieve(this.collectionName, {
                    ids: [id],
                    with_payload: true,
                    with_vector: false,
                });
                if (results.length === 0)
                    return null;
                return results[0].payload;
            }
            catch {
                return null;
            }
        }
        /**
         * Search for similar vectors.
         *
         * @param vector - The query vector.
         * @param limit - Maximum results to return.
         * @param filter - Optional Qdrant filter.
         * @returns An array of search results.
         */
        async search(vector, limit, filter) {
            const results = await this.client.search(this.collectionName, {
                vector,
                limit,
                with_payload: true,
                ...(filter ? { filter } : {}),
            });
            return results.map((r) => ({
                id: String(r.id),
                score: r.score,
                payload: r.payload,
            }));
        }
        /**
         * Scroll through all points matching a filter.
         *
         * @param filter - Optional Qdrant filter.
         * @param limit - Page size for scrolling.
         * @yields Scrolled points.
         */
        async *scroll(filter, limit = 100) {
            let offset = undefined;
            // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
            while (true) {
                const result = await this.client.scroll(this.collectionName, {
                    limit,
                    with_payload: true,
                    with_vector: false,
                    ...(filter ? { filter } : {}),
                    ...(offset !== undefined ? { offset } : {}),
                });
                for (const point of result.points) {
                    yield {
                        id: String(point.id),
                        payload: point.payload,
                    };
                }
                const nextOffset = result.next_page_offset;
                if (nextOffset === null || nextOffset === undefined) {
                    break;
                }
                if (typeof nextOffset === 'string' || typeof nextOffset === 'number') {
                    offset = nextOffset;
                }
                else {
                    break;
                }
            }
        }
    }
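
    // --- Usage sketch (editor's illustration, not part of the published bundle) ---
    // Iterating every indexed point without loading them all at once; the URL
    // and collection name are hypothetical.
    //
    //   const store = new VectorStoreClient({ url: 'http://localhost:6333', collectionName: 'jeeves' }, 768);
    //   await store.ensureCollection();
    //   for await (const point of store.scroll()) {
    //     console.log(point.id, point.payload?.file_path);
    //   }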

    /**
     * Filesystem watcher that maps chokidar events to the processing queue.
     */
    class FileSystemWatcher {
        config;
        queue;
        processor;
        logger;
        watcher;
        /**
         * Create a new FileSystemWatcher.
         *
         * @param config - Watch configuration.
         * @param queue - The event queue.
         * @param processor - The document processor.
         * @param logger - The logger instance.
         */
        constructor(config, queue, processor, logger) {
            this.config = config;
            this.queue = queue;
            this.processor = processor;
            this.logger = logger;
        }
        /**
         * Start watching the filesystem and processing events.
         */
        start() {
            this.watcher = chokidar.watch(this.config.paths, {
                ignored: this.config.ignored,
                usePolling: this.config.usePolling,
                interval: this.config.pollIntervalMs,
                awaitWriteFinish: this.config.stabilityThresholdMs
                    ? { stabilityThreshold: this.config.stabilityThresholdMs }
                    : false,
                ignoreInitial: false,
            });
            this.watcher.on('add', (path) => {
                this.logger.debug({ path }, 'File added');
                this.queue.enqueue({ type: 'create', path, priority: 'normal' }, () => this.processor.processFile(path));
            });
            this.watcher.on('change', (path) => {
                this.logger.debug({ path }, 'File changed');
                this.queue.enqueue({ type: 'modify', path, priority: 'normal' }, () => this.processor.processFile(path));
            });
            this.watcher.on('unlink', (path) => {
                this.logger.debug({ path }, 'File removed');
                this.queue.enqueue({ type: 'delete', path, priority: 'normal' }, () => this.processor.deleteFile(path));
            });
            this.watcher.on('error', (error) => {
                this.logger.error({ error }, 'Watcher error');
            });
            this.queue.process();
            this.logger.info({ paths: this.config.paths }, 'Filesystem watcher started');
        }
        /**
         * Stop the filesystem watcher.
         */
        async stop() {
            if (this.watcher) {
                await this.watcher.close();
                this.watcher = undefined;
                this.logger.info('Filesystem watcher stopped');
            }
        }
    }

    /**
     * Main application class that wires together all components.
     */
    class JeevesWatcher {
        config;
        configPath;
        logger;
        watcher;
        queue;
        server;
        processor;
        configWatcher;
        configDebounce;
        /**
         * Create a new JeevesWatcher instance.
         *
         * @param config - The application configuration.
         * @param configPath - Optional config file path to watch for changes.
         */
        constructor(config, configPath) {
            this.config = config;
            this.configPath = configPath;
        }
        /**
         * Start the watcher, API server, and all components.
         */
        async start() {
            const logger = createLogger(this.config.logging);
            this.logger = logger;
            let embeddingProvider;
            try {
                embeddingProvider = createEmbeddingProvider(this.config.embedding);
            }
            catch (error) {
                logger.fatal({ error }, 'Failed to create embedding provider');
                throw error;
            }
            const vectorStore = new VectorStoreClient(this.config.vectorStore, embeddingProvider.dimensions);
            await vectorStore.ensureCollection();
            const compiledRules = compileRules(this.config.inferenceRules ?? []);
            const processor = new DocumentProcessor(this.config, embeddingProvider, vectorStore, compiledRules, logger);
            this.processor = processor;
            const queue = new EventQueue({
                debounceMs: this.config.watch.debounceMs ?? 2000,
                concurrency: this.config.embedding.concurrency ?? 5,
                rateLimitPerMinute: this.config.embedding.rateLimitPerMinute,
            });
            this.queue = queue;
            const watcher = new FileSystemWatcher(this.config.watch, queue, processor, logger);
            this.watcher = watcher;
            const server = createApiServer({
                processor,
                vectorStore,
                embeddingProvider,
                queue,
                config: this.config,
                logger,
            });
            this.server = server;
            await server.listen({
                host: this.config.api?.host ?? '127.0.0.1',
                port: this.config.api?.port ?? 3100,
            });
            watcher.start();
            this.startConfigWatch();
            logger.info('jeeves-watcher started');
        }
        /**
         * Gracefully stop all components.
         */
        async stop() {
            await this.stopConfigWatch();
            if (this.watcher) {
                await this.watcher.stop();
            }
            if (this.queue) {
                const timeout = this.config.shutdownTimeoutMs ?? 10000;
                await Promise.race([
                    this.queue.drain(),
                    new Promise((resolve) => {
                        setTimeout(resolve, timeout);
                    }),
                ]);
            }
            if (this.server) {
                await this.server.close();
            }
            this.logger?.info('jeeves-watcher stopped');
        }
        startConfigWatch() {
            const logger = this.logger;
            if (!logger)
                return;
            const enabled = this.config.configWatch?.enabled ?? true;
            if (!enabled)
                return;
            if (!this.configPath) {
                logger.debug('Config watch enabled, but no config path was provided');
                return;
            }
            const debounceMs = this.config.configWatch?.debounceMs ?? 10000;
            this.configWatcher = chokidar.watch(this.configPath, {
                ignoreInitial: true,
            });
            this.configWatcher.on('change', () => {
                if (this.configDebounce)
                    clearTimeout(this.configDebounce);
                this.configDebounce = setTimeout(() => {
                    void this.reloadConfig();
                }, debounceMs);
            });
            this.configWatcher.on('error', (error) => {
                logger.error({ error }, 'Config watcher error');
            });
            logger.info({ configPath: this.configPath, debounceMs }, 'Config watcher started');
        }
        async stopConfigWatch() {
            if (this.configDebounce) {
                clearTimeout(this.configDebounce);
                this.configDebounce = undefined;
            }
            if (this.configWatcher) {
                await this.configWatcher.close();
                this.configWatcher = undefined;
            }
        }
        async reloadConfig() {
            const logger = this.logger;
            const processor = this.processor;
            if (!logger || !processor || !this.configPath)
                return;
            try {
                const newConfig = await loadConfig(this.configPath);
                this.config = newConfig;
                const compiledRules = compileRules(newConfig.inferenceRules ?? []);
                processor.updateRules(compiledRules);
                logger.info({ configPath: this.configPath, rules: compiledRules.length }, 'Config reloaded');
            }
            catch (error) {
                logger.error({ error }, 'Failed to reload config');
            }
        }
    }
    /**
     * Create and start a JeevesWatcher from a config file path.
     *
     * @param configPath - Optional path to the configuration file.
     * @returns The running JeevesWatcher instance.
     */
    async function startFromConfig(configPath) {
        const config = await loadConfig(configPath);
        const app = new JeevesWatcher(config, configPath);
        const shutdown = async () => {
            await app.stop();
            process.exit(0);
        };
        process.on('SIGTERM', () => void shutdown());
        process.on('SIGINT', () => void shutdown());
        await app.start();
        return app;
    }
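
    // --- Usage sketch (editor's illustration, not part of the published bundle) ---
    // Starting the whole pipeline from a config file; SIGINT/SIGTERM handlers
    // are installed by startFromConfig itself. The path is hypothetical.
    //
    //   const app = await startFromConfig('./jeeves-watcher.config.json');
    //   // ...later, if shutting down programmatically:
    //   await app.stop();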

    exports.DocumentProcessor = DocumentProcessor;
    exports.EventQueue = EventQueue;
    exports.FileSystemWatcher = FileSystemWatcher;
    exports.JeevesWatcher = JeevesWatcher;
    exports.VectorStoreClient = VectorStoreClient;
    exports.applyRules = applyRules;
    exports.buildAttributes = buildAttributes;
    exports.compileRules = compileRules;
    exports.contentHash = contentHash;
    exports.createApiServer = createApiServer;
    exports.createEmbeddingProvider = createEmbeddingProvider;
    exports.createLogger = createLogger;
    exports.deleteMetadata = deleteMetadata;
    exports.extractText = extractText;
    exports.loadConfig = loadConfig;
    exports.metadataPath = metadataPath;
    exports.pointId = pointId;
    exports.readMetadata = readMetadata;
    exports.startFromConfig = startFromConfig;
    exports.writeMetadata = writeMetadata;

})(this["jeeves-watcher"] = this["jeeves-watcher"] || {}, Fastify, node_crypto, promises, node_path, picomatch, chokidar, Ajv, cosmiconfig, googleGenai, pino, textsplitters, cheerio, yaml, mammoth, uuid, addFormats, jsClientRest);