@karmaniverous/jeeves-watcher 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -16
- package/config.schema.json +577 -0
- package/dist/cjs/index.js +800 -340
- package/dist/cli/jeeves-watcher/index.js +1130 -517
- package/dist/index.d.ts +160 -103
- package/dist/index.iife.js +796 -339
- package/dist/index.iife.min.js +1 -1
- package/dist/mjs/index.js +793 -341
- package/package.json +28 -22
package/dist/cjs/index.js
CHANGED
@@ -1,21 +1,24 @@
 'use strict';
 
 var Fastify = require('fastify');
+var radash = require('radash');
 var node_crypto = require('node:crypto');
 var promises = require('node:fs/promises');
 var node_path = require('node:path');
 var picomatch = require('picomatch');
 var chokidar = require('chokidar');
-var Ajv = require('ajv');
 var cosmiconfig = require('cosmiconfig');
+var zod = require('zod');
+var jsonmap = require('@karmaniverous/jsonmap');
 var googleGenai = require('@langchain/google-genai');
 var pino = require('pino');
-var
+var uuid = require('uuid');
 var cheerio = require('cheerio');
 var yaml = require('js-yaml');
 var mammoth = require('mammoth');
-var
+var Ajv = require('ajv');
 var addFormats = require('ajv-formats');
+var textsplitters = require('@langchain/textsplitters');
 var jsClientRest = require('@qdrant/js-client-rest');
 
 function _interopNamespaceDefault(e) {
@@ -37,6 +40,10 @@ function _interopNamespaceDefault(e) {
 
 var cheerio__namespace = /*#__PURE__*/_interopNamespaceDefault(cheerio);
 
+/**
+ * @module metadata/metadata
+ * Persists file metadata as .meta.json. I/O: reads/writes/deletes metadata files under metadataDir. Path mapping via SHA-256 hash.
+ */
 /**
  * Normalise a file path for deterministic mapping: lowercase, forward slashes, strip leading drive letter colon.
  *
@@ -176,6 +183,30 @@ async function listFilesFromGlobs(patterns, ignored = []) {
     return Array.from(seen);
 }
 
+/**
+ * @module processAllFiles
+ *
+ * Shared helper for processing all files matching configured globs.
+ */
+/**
+ * Process all files from globs using the specified processor method.
+ *
+ * @param watchPaths - The glob patterns to match.
+ * @param ignoredPaths - The glob patterns to ignore.
+ * @param processor - The document processor instance.
+ * @param method - The processor method to call ('processFile' or 'processRulesUpdate').
+ * @returns The number of files processed.
+ */
+async function processAllFiles(watchPaths, ignoredPaths, processor, method) {
+    const files = await listFilesFromGlobs(watchPaths, ignoredPaths);
+    for (const file of files) {
+        // Sequential on purpose to avoid surprising load.
+        // Queue integration can come later.
+        await processor[method](file);
+    }
+    return files.length;
+}
+
 /**
  * Create the Fastify API server with all routes registered.
  *
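
The processAllFiles helper above centralizes the glob-walk-and-process loop that 0.1.0 inlined in each API route. A minimal usage sketch (the globs and processor stub here are illustrative, not from the package):

    // Sketch: drive the helper with a hypothetical processor object.
    const processor = {
        async processFile(file) { console.log('indexing', file); },
        async processRulesUpdate(file) { console.log('re-applying rules to', file); },
    };
    const count = await processAllFiles(['docs/**/*.md'], ['**/node_modules/**'], processor, 'processFile');
    console.log(`indexed ${count} files`);
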
@@ -216,15 +247,8 @@ function createApiServer(options) {
     });
     app.post('/reindex', async (_request, reply) => {
        try {
-            const
-
-            // Sequential on purpose to avoid surprising load.
-            // Queue integration can come later.
-            await processor.processFile(file);
-            }
-            return await reply
-                .status(200)
-                .send({ ok: true, filesIndexed: files.length });
+            const count = await processAllFiles(options.config.watch.paths, options.config.watch.ignored, processor, 'processFile');
+            return await reply.status(200).send({ ok: true, filesIndexed: count });
         }
         catch (error) {
             logger.error({ error }, 'Reindex failed');
@@ -234,19 +258,21 @@ function createApiServer(options) {
     app.post('/rebuild-metadata', async (_request, reply) => {
         try {
             const metadataDir = options.config.metadataDir ?? '.jeeves-metadata';
+            const SYSTEM_KEYS = [
+                'file_path',
+                'chunk_index',
+                'total_chunks',
+                'content_hash',
+                'chunk_text',
+            ];
             for await (const point of vectorStore.scroll()) {
                 const payload = point.payload;
                 const filePath = payload['file_path'];
                 if (typeof filePath !== 'string' || filePath.length === 0)
                     continue;
                 // Persist only enrichment-ish fields, not chunking/index fields.
-                const
-
-                delete rest.chunk_index;
-                delete rest.total_chunks;
-                delete rest.content_hash;
-                delete rest.chunk_text;
-                await writeMetadata(filePath, metadataDir, rest);
+                const enrichment = radash.omit(payload, SYSTEM_KEYS);
+                await writeMetadata(filePath, metadataDir, enrichment);
             }
             return await reply.status(200).send({ ok: true });
         }
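
The rewrite replaces four manual delete statements on a cloned object with a single radash.omit call, which returns a shallow copy of the payload minus the listed system keys. Roughly, with a sample payload:

    const radash = require('radash');
    const payload = {
        file_path: 'docs/a.md', chunk_index: 0, total_chunks: 2,
        content_hash: 'abc123', chunk_text: '...', author: 'jane', tags: ['x'],
    };
    // omit() copies the object without the system keys; the input is untouched.
    const enrichment = radash.omit(payload, ['file_path', 'chunk_index', 'total_chunks', 'content_hash', 'chunk_text']);
    // => { author: 'jane', tags: ['x'] }
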
@@ -263,20 +289,13 @@ function createApiServer(options) {
         try {
             if (scope === 'rules') {
                 // Re-apply inference rules to all files, update Qdrant payloads (no re-embedding)
-                const
-
-                // Use the new processRulesUpdate method
-                await processor.processRulesUpdate(file);
-                }
-                logger.info({ scope, filesProcessed: files.length }, 'Config reindex (rules) completed');
+                const count = await processAllFiles(options.config.watch.paths, options.config.watch.ignored, processor, 'processRulesUpdate');
+                logger.info({ scope, filesProcessed: count }, 'Config reindex (rules) completed');
             }
             else {
                 // Full reindex: re-extract, re-embed, re-upsert
-                const
-
-                await processor.processFile(file);
-                }
-                logger.info({ scope, filesProcessed: files.length }, 'Config reindex (full) completed');
+                const count = await processAllFiles(options.config.watch.paths, options.config.watch.ignored, processor, 'processFile');
+                logger.info({ scope, filesProcessed: count }, 'Config reindex (full) completed');
             }
         }
         catch (error) {
@@ -293,117 +312,249 @@ function createApiServer(options) {
     return app;
 }
 
-
-
-
-
-
-
-        watch: {
-            type: 'object',
-            required: ['paths'],
-            properties: {
-                paths: { type: 'array', items: { type: 'string' }, minItems: 1 },
-                ignored: { type: 'array', items: { type: 'string' } },
-                pollIntervalMs: { type: 'number' },
-                usePolling: { type: 'boolean' },
-                debounceMs: { type: 'number' },
-                stabilityThresholdMs: { type: 'number' },
-            },
-            additionalProperties: false,
-        },
-        configWatch: {
-            type: 'object',
-            properties: {
-                enabled: { type: 'boolean' },
-                debounceMs: { type: 'number' },
-            },
-            additionalProperties: false,
-        },
-        embedding: {
-            type: 'object',
-            required: ['provider', 'model'],
-            properties: {
-                provider: { type: 'string' },
-                model: { type: 'string' },
-                chunkSize: { type: 'number' },
-                chunkOverlap: { type: 'number' },
-                dimensions: { type: 'number' },
-                apiKey: { type: 'string' },
-                rateLimitPerMinute: { type: 'number' },
-                concurrency: { type: 'number' },
-            },
-            additionalProperties: false,
-        },
-        vectorStore: {
-            type: 'object',
-            required: ['url', 'collectionName'],
-            properties: {
-                url: { type: 'string' },
-                collectionName: { type: 'string' },
-                apiKey: { type: 'string' },
-            },
-            additionalProperties: false,
-        },
-        metadataDir: { type: 'string' },
-        api: {
-            type: 'object',
-            properties: {
-                host: { type: 'string' },
-                port: { type: 'number' },
-            },
-            additionalProperties: false,
-        },
-        extractors: { type: 'object' },
-        inferenceRules: {
-            type: 'array',
-            items: {
-                type: 'object',
-                required: ['match', 'set'],
-                properties: {
-                    match: { type: 'object' },
-                    set: { type: 'object' },
-                },
-                additionalProperties: false,
-            },
-        },
-        logging: {
-            type: 'object',
-            properties: {
-                level: { type: 'string' },
-                file: { type: 'string' },
-            },
-            additionalProperties: false,
-        },
-        shutdownTimeoutMs: { type: 'number' },
-    },
-    additionalProperties: false,
-};
-const ajv = new Ajv({ allErrors: true });
-const validate = ajv.compile(configSchema);
-/** Default values for optional configuration fields. */
-const DEFAULTS = {
-    configWatch: { enabled: true, debounceMs: 1000 },
+/**
+ * @module config/defaults
+ * Default configuration values for jeeves-watcher. Pure data export, no I/O or side effects.
+ */
+/** Default root-level config values. */
+const ROOT_DEFAULTS = {
     metadataDir: '.jeeves-watcher',
-    api: { host: '127.0.0.1', port: 3100 },
-    logging: { level: 'info' },
     shutdownTimeoutMs: 10000,
 };
-/** Default values
+/** Default configWatch values. */
+const CONFIG_WATCH_DEFAULTS = {
+    enabled: true,
+    debounceMs: 1000,
+};
+/** Default API values. */
+const API_DEFAULTS = {
+    host: '127.0.0.1',
+    port: 3456,
+};
+/** Default logging values. */
+const LOGGING_DEFAULTS = {
+    level: 'info',
+};
+/** Default watch configuration. */
 const WATCH_DEFAULTS = {
     debounceMs: 300,
     stabilityThresholdMs: 500,
     usePolling: false,
     pollIntervalMs: 1000,
 };
-/** Default
+/** Default embedding configuration. */
 const EMBEDDING_DEFAULTS = {
     chunkSize: 1000,
     chunkOverlap: 200,
-    dimensions:
+    dimensions: 3072,
     rateLimitPerMinute: 300,
     concurrency: 5,
 };
+
+/**
+ * Watch configuration for file system monitoring.
+ */
+const watchConfigSchema = zod.z.object({
+    /** Glob patterns to watch. */
+    paths: zod.z
+        .array(zod.z.string())
+        .min(1)
+        .describe('Glob patterns for files to watch (e.g., "**/*.md"). At least one required.'),
+    /** Glob patterns to ignore. */
+    ignored: zod.z
+        .array(zod.z.string())
+        .optional()
+        .describe('Glob patterns to exclude from watching (e.g., "**/node_modules/**").'),
+    /** Polling interval in milliseconds. */
+    pollIntervalMs: zod.z
+        .number()
+        .optional()
+        .describe('Polling interval in milliseconds when usePolling is enabled.'),
+    /** Whether to use polling instead of native watchers. */
+    usePolling: zod.z
+        .boolean()
+        .optional()
+        .describe('Use polling instead of native file system events (for network drives).'),
+    /** Debounce delay in milliseconds for file change events. */
+    debounceMs: zod.z
+        .number()
+        .optional()
+        .describe('Debounce delay in milliseconds for file change events.'),
+    /** Time in milliseconds a file must be stable before processing. */
+    stabilityThresholdMs: zod.z
+        .number()
+        .optional()
+        .describe('Time in milliseconds a file must remain unchanged before processing.'),
+});
+/**
+ * Configuration watch settings.
+ */
+const configWatchConfigSchema = zod.z.object({
+    /** Whether config file watching is enabled. */
+    enabled: zod.z
+        .boolean()
+        .optional()
+        .describe('Enable automatic reloading when config file changes.'),
+    /** Debounce delay in milliseconds for config change events. */
+    debounceMs: zod.z
+        .number()
+        .optional()
+        .describe('Debounce delay in milliseconds for config file change detection.'),
+});
+/**
+ * Embedding model configuration.
+ */
+const embeddingConfigSchema = zod.z.object({
+    /** The embedding model provider. */
+    provider: zod.z
+        .string()
+        .default('gemini')
+        .describe('Embedding provider name (e.g., "gemini", "openai").'),
+    /** The embedding model name. */
+    model: zod.z
+        .string()
+        .default('gemini-embedding-001')
+        .describe('Embedding model identifier (e.g., "gemini-embedding-001", "text-embedding-3-small").'),
+    /** Maximum tokens per chunk for splitting. */
+    chunkSize: zod.z
+        .number()
+        .optional()
+        .describe('Maximum chunk size in characters for text splitting.'),
+    /** Overlap between chunks in tokens. */
+    chunkOverlap: zod.z
+        .number()
+        .optional()
+        .describe('Character overlap between consecutive chunks.'),
+    /** Embedding vector dimensions. */
+    dimensions: zod.z
+        .number()
+        .optional()
+        .describe('Embedding vector dimensions (must match model output).'),
+    /** API key for the embedding provider. */
+    apiKey: zod.z
+        .string()
+        .optional()
+        .describe('API key for embedding provider (supports ${ENV_VAR} substitution).'),
+    /** Maximum embedding requests per minute. */
+    rateLimitPerMinute: zod.z
+        .number()
+        .optional()
+        .describe('Maximum embedding API requests per minute (rate limiting).'),
+    /** Maximum concurrent embedding requests. */
+    concurrency: zod.z
+        .number()
+        .optional()
+        .describe('Maximum concurrent embedding requests.'),
+});
+/**
+ * Vector store configuration for Qdrant.
+ */
+const vectorStoreConfigSchema = zod.z.object({
+    /** Qdrant server URL. */
+    url: zod.z
+        .string()
+        .describe('Qdrant server URL (e.g., "http://localhost:6333").'),
+    /** Qdrant collection name. */
+    collectionName: zod.z
+        .string()
+        .describe('Qdrant collection name for vector storage.'),
+    /** Qdrant API key. */
+    apiKey: zod.z
+        .string()
+        .optional()
+        .describe('Qdrant API key for authentication (supports ${ENV_VAR} substitution).'),
+});
+/**
+ * API server configuration.
+ */
+const apiConfigSchema = zod.z.object({
+    /** Host to bind to. */
+    host: zod.z
+        .string()
+        .optional()
+        .describe('Host address for API server (e.g., "127.0.0.1", "0.0.0.0").'),
+    /** Port to listen on. */
+    port: zod.z.number().optional().describe('Port for API server (e.g., 3456).'),
+});
+/**
+ * Logging configuration.
+ */
+const loggingConfigSchema = zod.z.object({
+    /** Log level. */
+    level: zod.z
+        .string()
+        .optional()
+        .describe('Logging level (trace, debug, info, warn, error, fatal).'),
+    /** Log file path. */
+    file: zod.z
+        .string()
+        .optional()
+        .describe('Path to log file (logs to stdout if omitted).'),
+});
+/**
+ * An inference rule that enriches document metadata.
+ */
+const inferenceRuleSchema = zod.z.object({
+    /** JSON Schema object to match against document metadata. */
+    match: zod.z
+        .record(zod.z.string(), zod.z.unknown())
+        .describe('JSON Schema object to match against file attributes.'),
+    /** Metadata fields to set when the rule matches. */
+    set: zod.z
+        .record(zod.z.string(), zod.z.unknown())
+        .describe('Metadata fields to set when match succeeds.'),
+    /** JsonMap transformation (inline or reference to named map). */
+    map: zod.z
+        .union([jsonmap.jsonMapMapSchema, zod.z.string()])
+        .optional()
+        .describe('JsonMap transformation (inline definition or named map reference).'),
+});
+/**
+ * Top-level configuration for jeeves-watcher.
+ */
+const jeevesWatcherConfigSchema = zod.z.object({
+    /** File system watch configuration. */
+    watch: watchConfigSchema.describe('File system watch configuration.'),
+    /** Configuration file watch settings. */
+    configWatch: configWatchConfigSchema
+        .optional()
+        .describe('Configuration file watch settings.'),
+    /** Embedding model configuration. */
+    embedding: embeddingConfigSchema.describe('Embedding model configuration.'),
+    /** Vector store configuration. */
+    vectorStore: vectorStoreConfigSchema.describe('Qdrant vector store configuration.'),
+    /** Directory for persisted metadata. */
+    metadataDir: zod.z
+        .string()
+        .optional()
+        .describe('Directory for persisted metadata sidecar files.'),
+    /** API server configuration. */
+    api: apiConfigSchema.optional().describe('API server configuration.'),
+    /** Extractor configurations keyed by name. */
+    extractors: zod.z
+        .record(zod.z.string(), zod.z.unknown())
+        .optional()
+        .describe('Extractor configurations keyed by name.'),
+    /** Rules for inferring metadata from document properties. */
+    inferenceRules: zod.z
+        .array(inferenceRuleSchema)
+        .optional()
+        .describe('Rules for inferring metadata from file attributes.'),
+    /** Reusable named JsonMap transformations. */
+    maps: zod.z
+        .record(zod.z.string(), jsonmap.jsonMapMapSchema)
+        .optional()
+        .describe('Reusable named JsonMap transformations.'),
+    /** Logging configuration. */
+    logging: loggingConfigSchema.optional().describe('Logging configuration.'),
+    /** Timeout in milliseconds for graceful shutdown. */
+    shutdownTimeoutMs: zod.z
+        .number()
+        .optional()
+        .describe('Timeout in milliseconds for graceful shutdown.'),
+});
+
+const MODULE_NAME = 'jeeves-watcher';
 /**
  * Merge sensible defaults into a loaded configuration.
  *
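
For orientation, here is a config object the new zod schema would accept; the values are illustrative. Only watch.paths, vectorStore.url, and vectorStore.collectionName lack defaults and must be supplied (the embedding object itself is required, but its provider and model fall back to the Gemini defaults):

    const config = {
        watch: { paths: ['docs/**/*.md'], ignored: ['**/node_modules/**'] },
        embedding: { apiKey: '${GEMINI_API_KEY}' }, // provider/model default to gemini
        vectorStore: { url: 'http://localhost:6333', collectionName: 'jeeves' },
        inferenceRules: [{ match: {}, set: { source: 'docs' } }],
    };
    jeevesWatcherConfigSchema.parse(config); // throws ZodError on invalid input
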
@@ -412,13 +563,13 @@ const EMBEDDING_DEFAULTS = {
  */
 function applyDefaults(raw) {
     return {
-        ...
+        ...ROOT_DEFAULTS,
         ...raw,
         watch: { ...WATCH_DEFAULTS, ...raw.watch },
-        configWatch: { ...
+        configWatch: { ...CONFIG_WATCH_DEFAULTS, ...raw.configWatch },
         embedding: { ...EMBEDDING_DEFAULTS, ...raw.embedding },
-        api: { ...
-        logging: { ...
+        api: { ...API_DEFAULTS, ...raw.api },
+        logging: { ...LOGGING_DEFAULTS, ...raw.logging },
     };
 }
 /**
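
The spread order makes user values win over defaults at every level. For example, given the defaults above and a hypothetical partial config:

    const cfg = applyDefaults({
        watch: { paths: ['**/*.md'], debounceMs: 50 },
        embedding: {},
        vectorStore: { url: 'http://localhost:6333', collectionName: 'jeeves' },
    });
    // cfg.watch.debounceMs === 50     (user value wins over WATCH_DEFAULTS)
    // cfg.watch.usePolling === false  (filled from WATCH_DEFAULTS)
    // cfg.api.port === 3456           (filled from API_DEFAULTS)
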
@@ -436,21 +587,89 @@ async function loadConfig(configPath) {
     if (!result || result.isEmpty) {
         throw new Error('No jeeves-watcher configuration found. Create a .jeeves-watcherrc or jeeves-watcher.config.{js,ts,json,yaml} file.');
     }
-
-
-
-
-
-
-
-
-
-
-
+    try {
+        const validated = jeevesWatcherConfigSchema.parse(result.config);
+        return applyDefaults(validated);
+    }
+    catch (error) {
+        if (error instanceof zod.ZodError) {
+            const errors = error.issues
+                .map((issue) => `${issue.path.join('.')}: ${issue.message}`)
+                .join('; ');
+            throw new Error(`Invalid jeeves-watcher configuration: ${errors}`);
+        }
+        throw error;
     }
-    return applyDefaults(raw);
 }
 
+/**
+ * @module util/retry
+ * Small async retry helper with exponential backoff. Side effects: sleeps between attempts; can invoke onRetry callback for logging.
+ */
+function sleep(ms, signal) {
+    if (ms <= 0)
+        return Promise.resolve();
+    return new Promise((resolve, reject) => {
+        const timer = setTimeout(() => {
+            cleanup();
+            resolve();
+        }, ms);
+        const onAbort = () => {
+            cleanup();
+            reject(new Error('Retry sleep aborted'));
+        };
+        const cleanup = () => {
+            clearTimeout(timer);
+            if (signal)
+                signal.removeEventListener('abort', onAbort);
+        };
+        if (signal) {
+            if (signal.aborted) {
+                onAbort();
+                return;
+            }
+            signal.addEventListener('abort', onAbort, { once: true });
+        }
+    });
+}
+function computeDelayMs(attempt, baseDelayMs, maxDelayMs, jitter = 0) {
+    const exp = Math.max(0, attempt - 1);
+    const raw = Math.min(maxDelayMs, baseDelayMs * 2 ** exp);
+    const factor = jitter > 0 ? 1 + Math.random() * jitter : 1;
+    return Math.round(raw * factor);
+}
+/**
+ * Retry an async operation using exponential backoff.
+ *
+ * @param fn - Operation to execute.
+ * @param options - Retry policy.
+ * @returns The operation result.
+ */
+async function retry(fn, options) {
+    const attempts = Math.max(1, options.attempts);
+    let lastError;
+    for (let attempt = 1; attempt <= attempts; attempt++) {
+        try {
+            return await fn(attempt);
+        }
+        catch (error) {
+            lastError = error;
+            const isLast = attempt >= attempts;
+            if (isLast)
+                break;
+            const delayMs = computeDelayMs(attempt, options.baseDelayMs, options.maxDelayMs, options.jitter);
+            options.onRetry?.({ attempt, attempts, delayMs, error });
+            await sleep(delayMs, options.signal);
+        }
+    }
+    throw lastError;
+}
+
+/**
+ * @module embedding
+ *
+ * Embedding provider abstractions and registry-backed factory.
+ */
 /**
  * Create a mock embedding provider that generates deterministic vectors from content hashes.
 *
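
The retry helper backs off as baseDelayMs * 2^(attempt - 1), capped at maxDelayMs, with optional multiplicative jitter; computeDelayMs(3, 500, 10_000) is therefore 2000 ms before jitter. A usage sketch against a hypothetical flaky call:

    const result = await retry(
        async (attempt) => fetchFlakyThing(attempt), // hypothetical async operation
        {
            attempts: 3,
            baseDelayMs: 200, // delays: ~200 ms after attempt 1, ~400 ms after attempt 2
            maxDelayMs: 2000,
            jitter: 0.1, // each delay stretched by up to 10%
            onRetry: ({ attempt, delayMs }) => console.warn(`attempt ${attempt} failed; sleeping ${delayMs} ms`),
        },
    );
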
@@ -478,10 +697,11 @@ function createMockProvider(dimensions) {
  * Create a Gemini embedding provider using the Google Generative AI SDK.
  *
  * @param config - The embedding configuration.
+ * @param logger - Optional pino logger for retry warnings.
  * @returns A Gemini {@link EmbeddingProvider}.
  * @throws If the API key is missing.
  */
-function createGeminiProvider(config) {
+function createGeminiProvider(config, logger) {
     if (!config.apiKey) {
         throw new Error('Gemini embedding provider requires config.embedding.apiKey');
     }
@@ -493,8 +713,43 @@ function createGeminiProvider(config) {
     return {
         dimensions,
         async embed(texts) {
-
-
+            const vectors = await retry(async (attempt) => {
+                if (attempt > 1) {
+                    const msg = {
+                        attempt,
+                        provider: 'gemini',
+                        model: config.model,
+                    };
+                    if (logger) {
+                        logger.warn(msg, 'Retrying embedding request');
+                    }
+                    else {
+                        console.warn(msg, 'Retrying embedding request');
+                    }
+                }
+                // embedDocuments returns vectors for multiple texts
+                return embedder.embedDocuments(texts);
+            }, {
+                attempts: 5,
+                baseDelayMs: 500,
+                maxDelayMs: 10_000,
+                jitter: 0.2,
+                onRetry: ({ attempt, delayMs, error }) => {
+                    const msg = {
+                        attempt,
+                        delayMs,
+                        provider: 'gemini',
+                        model: config.model,
+                        error,
+                    };
+                    if (logger) {
+                        logger.warn(msg, 'Embedding call failed; will retry');
+                    }
+                    else {
+                        console.warn(msg, 'Embedding call failed; will retry');
+                    }
+                },
+            });
             // Validate dimensions
             for (const vector of vectors) {
                 if (vector.length !== dimensions) {
@@ -505,25 +760,36 @@ function createGeminiProvider(config) {
         },
     };
 }
+function createMockFromConfig(config) {
+    const dimensions = config.dimensions ?? 768;
+    return createMockProvider(dimensions);
+}
+const embeddingProviderRegistry = new Map([
+    ['mock', createMockFromConfig],
+    ['gemini', createGeminiProvider],
+]);
 /**
  * Create an embedding provider based on the given configuration.
  *
+ * Each provider is responsible for its own default dimensions.
+ *
  * @param config - The embedding configuration.
+ * @param logger - Optional pino logger for retry warnings.
  * @returns An {@link EmbeddingProvider} instance.
  * @throws If the configured provider is not supported.
  */
-function createEmbeddingProvider(config) {
-    const
-
-
-        return createMockProvider(dimensions);
-    case 'gemini':
-        return createGeminiProvider(config);
-    default:
-        throw new Error(`Unsupported embedding provider: ${config.provider}`);
+function createEmbeddingProvider(config, logger) {
+    const factory = embeddingProviderRegistry.get(config.provider);
+    if (!factory) {
+        throw new Error(`Unsupported embedding provider: ${config.provider}`);
     }
+    return factory(config, logger);
 }
 
+/**
+ * @module logger
+ * Creates pino logger instances. I/O: optionally writes logs to file via pino/file transport. Defaults to stdout at info level.
+ */
 /**
  * Create a pino logger instance.
  *
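
Replacing the 0.1.0 switch with a Map registry means adding a provider is a registration, not a code edit. A sketch (this 'openai' factory is hypothetical; the package ships only 'mock' and 'gemini'):

    // Hypothetical third provider: any (config, logger) => EmbeddingProvider factory works.
    embeddingProviderRegistry.set('openai', (config, logger) => ({
        dimensions: config.dimensions ?? 1536,
        async embed(texts) {
            // call the provider's embedding API here; stubbed for the sketch
            return texts.map(() => new Array(config.dimensions ?? 1536).fill(0));
        },
    }));
    const provider = createEmbeddingProvider({ provider: 'mock', dimensions: 8 });
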
@@ -542,6 +808,54 @@ function createLogger(config) {
     return pino({ level });
 }
 
+/**
+ * @module hash
+ * Provides SHA-256 content hashing. Pure function: given text string, returns hex digest. No I/O or side effects.
+ */
+/**
+ * Compute a SHA-256 hex digest of the given text.
+ *
+ * @param text - The input text to hash.
+ * @returns The hex-encoded SHA-256 hash.
+ */
+function contentHash(text) {
+    return node_crypto.createHash('sha256').update(text, 'utf8').digest('hex');
+}
+
+/**
+ * @module pointId
+ * Generates deterministic UUIDv5 point IDs for file paths and chunk indices. Pure function: normalizes paths, returns stable IDs. No I/O.
+ */
+/** Namespace UUID for jeeves-watcher point IDs. */
+const NAMESPACE = '6a6f686e-6761-4c74-ad6a-656576657321';
+/**
+ * Normalise a file path for deterministic point ID generation.
+ *
+ * @param filePath - The original file path.
+ * @returns The normalised path string.
+ */
+function normalisePath(filePath) {
+    return filePath.replace(/\\/g, '/').toLowerCase();
+}
+/**
+ * Generate a deterministic UUID v5 point ID for a file (and optional chunk index).
+ *
+ * @param filePath - The file path.
+ * @param chunkIndex - Optional chunk index within the file.
+ * @returns A deterministic UUID v5 string.
+ */
+function pointId(filePath, chunkIndex) {
+    const key = chunkIndex !== undefined
+        ? `${normalisePath(filePath)}#${String(chunkIndex)}`
+        : normalisePath(filePath);
+    return uuid.v5(key, NAMESPACE);
+}
+
+/**
+ * @module extractors
+ *
+ * Text extraction registry for supported file formats.
+ */
 /**
  * Extract YAML frontmatter from a Markdown document.
  *
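
Because pointId is a UUIDv5 over the normalised path (plus an optional chunk index), the same file always maps to the same Qdrant point IDs, which is what makes idempotent upserts and orphan cleanup work. For example:

    pointId('C:\\Docs\\A.md', 0) === pointId('c:/docs/a.md', 0); // true: backslashes and case are normalised
    pointId('docs/a.md', 0) !== pointId('docs/a.md', 1);         // true: each chunk gets its own stable ID
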
@@ -587,6 +901,55 @@ function extractJsonText(obj) {
     }
     return JSON.stringify(obj);
 }
+async function extractMarkdown(filePath) {
+    const raw = await promises.readFile(filePath, 'utf8');
+    const { frontmatter, body } = extractMarkdownFrontmatter(raw);
+    return { text: body, frontmatter };
+}
+async function extractPlaintext(filePath) {
+    const raw = await promises.readFile(filePath, 'utf8');
+    return { text: raw };
+}
+async function extractJson(filePath) {
+    const raw = await promises.readFile(filePath, 'utf8');
+    const parsed = JSON.parse(raw);
+    const json = parsed && typeof parsed === 'object' && !Array.isArray(parsed)
+        ? parsed
+        : undefined;
+    return { text: extractJsonText(parsed), json };
+}
+async function extractPdf(filePath) {
+    const buffer = await promises.readFile(filePath);
+    const uint8Array = new Uint8Array(buffer);
+    const { extractText: extractPdfText } = await import('unpdf');
+    const { text } = await extractPdfText(uint8Array);
+    // unpdf returns an array of strings (one per page)
+    const content = Array.isArray(text) ? text.join('\n\n') : text;
+    return { text: content };
+}
+async function extractDocx(filePath) {
+    const buffer = await promises.readFile(filePath);
+    const result = await mammoth.extractRawText({ buffer });
+    return { text: result.value };
+}
+async function extractHtml(filePath) {
+    const raw = await promises.readFile(filePath, 'utf8');
+    const $ = cheerio__namespace.load(raw);
+    $('script, style').remove();
+    const text = $('body').text().trim() || $.text().trim();
+    return { text };
+}
+const extractorRegistry = new Map([
+    ['.md', extractMarkdown],
+    ['.markdown', extractMarkdown],
+    ['.txt', extractPlaintext],
+    ['.text', extractPlaintext],
+    ['.json', extractJson],
+    ['.pdf', extractPdf],
+    ['.docx', extractDocx],
+    ['.html', extractHtml],
+    ['.htm', extractHtml],
+]);
 /**
  * Extract text from a file based on extension.
  *
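
Like the embedding providers, extraction now dispatches through a Map keyed by lowercased extension, with plaintext as the fallback (see extractText in the next hunk). Supporting a new format becomes a registration; a hypothetical YAML entry, not shipped in this package:

    // Hypothetical: route .yaml files through the already-imported js-yaml.
    extractorRegistry.set('.yaml', async (filePath) => {
        const raw = await promises.readFile(filePath, 'utf8');
        const parsed = yaml.load(raw);
        return { text: raw, json: typeof parsed === 'object' && parsed !== null ? parsed : undefined };
    });
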
@@ -595,85 +958,11 @@ function extractJsonText(obj) {
  * @returns Extracted text and optional structured data.
  */
 async function extractText(filePath, extension) {
-    const
-    if (
-
-        const { frontmatter, body } = extractMarkdownFrontmatter(raw);
-        return { text: body, frontmatter };
-    }
-    if (ext === '.txt' || ext === '.text') {
-        const raw = await promises.readFile(filePath, 'utf8');
-        return { text: raw };
-    }
-    if (ext === '.json') {
-        const raw = await promises.readFile(filePath, 'utf8');
-        const parsed = JSON.parse(raw);
-        const json = parsed && typeof parsed === 'object' && !Array.isArray(parsed)
-            ? parsed
-            : undefined;
-        return { text: extractJsonText(parsed), json };
-    }
-    if (ext === '.pdf') {
-        const buffer = await promises.readFile(filePath);
-        const uint8Array = new Uint8Array(buffer);
-        const { extractText: extractPdfText } = await import('unpdf');
-        const { text } = await extractPdfText(uint8Array);
-        // unpdf returns an array of strings (one per page)
-        const content = Array.isArray(text) ? text.join('\n\n') : text;
-        return { text: content };
-    }
-    if (ext === '.docx') {
-        const buffer = await promises.readFile(filePath);
-        const result = await mammoth.extractRawText({ buffer });
-        return { text: result.value };
-    }
-    if (ext === '.html' || ext === '.htm') {
-        const raw = await promises.readFile(filePath, 'utf8');
-        const $ = cheerio__namespace.load(raw);
-        // Remove script and style elements
-        $('script, style').remove();
-        // Extract text content
-        const text = $('body').text().trim() || $.text().trim();
-        return { text };
-    }
+    const extractor = extractorRegistry.get(extension.toLowerCase());
+    if (extractor)
+        return extractor(filePath);
     // Default: treat as plaintext.
-
-    return { text: raw };
-}
-
-/**
- * Compute a SHA-256 hex digest of the given text.
- *
- * @param text - The input text to hash.
- * @returns The hex-encoded SHA-256 hash.
- */
-function contentHash(text) {
-    return node_crypto.createHash('sha256').update(text, 'utf8').digest('hex');
-}
-
-/** Namespace UUID for jeeves-watcher point IDs. */
-const NAMESPACE = '6a6f686e-6761-4c74-ad6a-656576657321';
-/**
- * Normalise a file path for deterministic point ID generation.
- *
- * @param filePath - The original file path.
- * @returns The normalised path string.
- */
-function normalisePath(filePath) {
-    return filePath.replace(/\\/g, '/').toLowerCase();
-}
-/**
- * Generate a deterministic UUID v5 point ID for a file (and optional chunk index).
- *
- * @param filePath - The file path.
- * @param chunkIndex - Optional chunk index within the file.
- * @returns A deterministic UUID v5 string.
- */
-function pointId(filePath, chunkIndex) {
-    const key = chunkIndex !== undefined
-        ? `${normalisePath(filePath)}#${String(chunkIndex)}`
-        : normalisePath(filePath);
-    return uuid.v5(key, NAMESPACE);
+    return extractPlaintext(filePath);
 }
 
 /**
@@ -746,13 +1035,7 @@ function resolveTemplateVars(value, attributes) {
     if (typeof value !== 'string')
         return value;
     return value.replace(/\$\{([^}]+)\}/g, (_match, varPath) => {
-        const
-        let current = attributes;
-        for (const part of parts) {
-            if (current === null || current === undefined)
-                return '';
-            current = current[part];
-        }
+        const current = radash.get(attributes, varPath);
         if (current === null || current === undefined)
             return '';
         return typeof current === 'string' ? current : JSON.stringify(current);
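
The hand-rolled path walk is replaced by radash.get, which resolves dotted paths in one call. For example:

    const attributes = { frontmatter: { author: { name: 'Jane' } }, ext: '.md' };
    resolveTemplateVars('by ${frontmatter.author.name}', attributes); // 'by Jane'
    resolveTemplateVars('${missing.path}', attributes);               // '' (null/undefined renders empty)
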
@@ -772,25 +1055,170 @@ function resolveSet(setObj, attributes) {
     }
     return result;
 }
+/**
+ * Create the lib object for JsonMap transformations.
+ * Provides utility functions for path manipulation.
+ *
+ * @returns The lib object.
+ */
+function createJsonMapLib() {
+    return {
+        split: (str, separator) => str.split(separator),
+        slice: (arr, start, end) => arr.slice(start, end),
+        join: (arr, separator) => arr.join(separator),
+        toLowerCase: (str) => str.toLowerCase(),
+        replace: (str, search, replacement) => str.replace(search, replacement),
+        get: (obj, path) => radash.get(obj, path),
+    };
+}
 /**
  * Apply compiled inference rules to file attributes, returning merged metadata.
  *
  * Rules are evaluated in order; later rules override earlier ones.
+ * If a rule has a `map`, the JsonMap transformation is applied after `set` resolution,
+ * and map output overrides set output on conflict.
  *
  * @param compiledRules - The compiled rules to evaluate.
  * @param attributes - The file attributes to match against.
+ * @param namedMaps - Optional record of named JsonMap definitions.
+ * @param logger - Optional pino logger for warnings (falls back to console.warn).
  * @returns The merged metadata from all matching rules.
  */
-function applyRules(compiledRules, attributes) {
+async function applyRules(compiledRules, attributes, namedMaps, logger) {
+    // JsonMap's type definitions expect a generic JsonMapLib shape with unary functions.
+    // Our helper functions accept multiple args, which JsonMap supports at runtime.
+    const lib = createJsonMapLib();
     let merged = {};
+    const log = logger ?? console;
     for (const { rule, validate } of compiledRules) {
         if (validate(attributes)) {
-
+            // Apply set resolution
+            const setOutput = resolveSet(rule.set, attributes);
+            merged = { ...merged, ...setOutput };
+            // Apply map transformation if present
+            if (rule.map) {
+                let mapDef;
+                // Resolve map reference
+                if (typeof rule.map === 'string') {
+                    mapDef = namedMaps?.[rule.map];
+                    if (!mapDef) {
+                        log.warn(`Map reference "${rule.map}" not found in named maps. Skipping map transformation.`);
+                        continue;
+                    }
+                }
+                else {
+                    mapDef = rule.map;
+                }
+                // Execute JsonMap transformation
+                try {
+                    const jsonMap = new jsonmap.JsonMap(mapDef, lib);
+                    const mapOutput = await jsonMap.transform(attributes);
+                    if (mapOutput &&
+                        typeof mapOutput === 'object' &&
+                        !Array.isArray(mapOutput)) {
+                        merged = { ...merged, ...mapOutput };
+                    }
+                    else {
+                        log.warn(`JsonMap transformation did not return an object; skipping merge.`);
+                    }
+                }
+                catch (error) {
+                    log.warn(`JsonMap transformation failed: ${error instanceof Error ? error.message : String(error)}`);
+                }
+            }
         }
     }
     return merged;
 }
 
+/**
+ * @module processor/buildMetadata
+ * Builds merged metadata from file content, inference rules, and enrichment. I/O: reads files, extracts text, loads enrichment .meta.json.
+ */
+/**
+ * Build merged metadata for a file by applying inference rules and merging with enrichment metadata.
+ *
+ * @param filePath - The file to process.
+ * @param compiledRules - The compiled inference rules.
+ * @param metadataDir - The metadata directory for enrichment files.
+ * @param maps - Optional named JsonMap definitions.
+ * @param logger - Optional logger for rule warnings.
+ * @returns The merged metadata and intermediate data.
+ */
+async function buildMergedMetadata(filePath, compiledRules, metadataDir, maps, logger) {
+    const ext = node_path.extname(filePath);
+    const stats = await promises.stat(filePath);
+    // 1. Extract text and structured data
+    const extracted = await extractText(filePath, ext);
+    // 2. Build attributes + apply rules
+    const attributes = buildAttributes(filePath, stats, extracted.frontmatter, extracted.json);
+    const inferred = await applyRules(compiledRules, attributes, maps, logger);
+    // 3. Read enrichment metadata (merge, enrichment wins)
+    const enrichment = await readMetadata(filePath, metadataDir);
+    const metadata = {
+        ...inferred,
+        ...(enrichment ?? {}),
+    };
+    return { inferred, enrichment, metadata, attributes, extracted };
+}
+
+/**
+ * @module processor/chunkIds
+ * Generates chunk point IDs from file paths and chunk indices. Extracts chunk counts from Qdrant payloads. Pure functions, no I/O.
+ */
+/**
+ * Generate an array of chunk IDs for a file.
+ *
+ * @param filePath - The file path.
+ * @param totalChunks - The total number of chunks.
+ * @returns An array of point IDs for each chunk.
+ */
+function chunkIds(filePath, totalChunks) {
+    const ids = [];
+    for (let i = 0; i < totalChunks; i++) {
+        ids.push(pointId(filePath, i));
+    }
+    return ids;
+}
+/**
+ * Extract the total chunk count from a payload, with a fallback.
+ *
+ * @param payload - The Qdrant point payload (or null).
+ * @param fallback - The fallback value if total_chunks is missing or invalid.
+ * @returns The total chunk count.
+ */
+function getChunkCount(payload, fallback = 1) {
+    if (!payload)
+        return fallback;
+    const count = payload['total_chunks'];
+    return typeof count === 'number' ? count : fallback;
+}
+
+/**
+ * @module processor/splitter
+ * Factory for LangChain text splitters. Returns MarkdownTextSplitter or RecursiveCharacterTextSplitter based on file extension. No I/O.
+ */
+/**
+ * Create the appropriate text splitter for the given file extension.
+ *
+ * @param ext - File extension (including leading dot).
+ * @param chunkSize - Maximum chunk size in characters.
+ * @param chunkOverlap - Overlap between chunks in characters.
+ * @returns A text splitter instance.
+ */
+function createSplitter(ext, chunkSize, chunkOverlap) {
+    const lowerExt = ext.toLowerCase();
+    if (lowerExt === '.md' || lowerExt === '.markdown') {
+        return new textsplitters.MarkdownTextSplitter({ chunkSize, chunkOverlap });
+    }
+    return new textsplitters.RecursiveCharacterTextSplitter({ chunkSize, chunkOverlap });
+}
+
+/**
+ * @module processor
+ *
+ * Core document processing pipeline. Handles extracting text, computing embeddings, syncing with vector store.
+ */
 /**
  * Core document processing pipeline.
  *
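
Putting the new pieces together: a rule can now combine a match schema, a template-driven set, and a JsonMap map (inline, or a name resolved from the top-level maps record). A hypothetical rule shape, assuming ajv-style JSON Schema matching as elsewhere in this file:

    const rule = {
        match: { properties: { ext: { const: '.md' } } }, // JSON Schema over file attributes
        set: { kind: 'doc', author: '${frontmatter.author}' },
        map: 'slugify', // looked up in config.maps when applyRules runs
    };
    // Merge order inside applyRules: set output first, then map output wins on conflict.
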
@@ -802,11 +1230,10 @@ class DocumentProcessor {
     vectorStore;
     compiledRules;
     logger;
-    metadataDir;
     /**
      * Create a new DocumentProcessor.
      *
-     * @param config - The
+     * @param config - The processor configuration.
     * @param embeddingProvider - The embedding provider.
     * @param vectorStore - The vector store client.
     * @param compiledRules - The compiled inference rules.
@@ -818,7 +1245,6 @@ class DocumentProcessor {
         this.vectorStore = vectorStore;
         this.compiledRules = compiledRules;
         this.logger = logger;
-        this.metadataDir = config.metadataDir ?? '.jeeves-metadata';
     }
     /**
      * Process a file through the full pipeline: extract, hash, chunk, embed, upsert.
@@ -828,9 +1254,8 @@ class DocumentProcessor {
     async processFile(filePath) {
         try {
             const ext = node_path.extname(filePath);
-
-
-            const extracted = await extractText(filePath, ext);
+            // 1. Build merged metadata + extract text
+            const { metadata, extracted } = await buildMergedMetadata(filePath, this.compiledRules, this.config.metadataDir, this.config.maps, this.logger);
             if (!extracted.text.trim()) {
                 this.logger.debug({ filePath }, 'Skipping empty file');
                 return;
@@ -843,26 +1268,15 @@ class DocumentProcessor {
                 this.logger.debug({ filePath }, 'Content unchanged, skipping');
                 return;
             }
-            const oldTotalChunks =
-
-
-
-            const
-            const inferred = applyRules(this.compiledRules, attributes);
-            // 4. Read enrichment metadata (merge, enrichment wins)
-            const enrichment = await readMetadata(filePath, this.metadataDir);
-            const metadata = {
-                ...inferred,
-                ...(enrichment ?? {}),
-            };
-            // 5. Chunk text
-            const chunkSize = this.config.embedding.chunkSize ?? 1000;
-            const chunkOverlap = this.config.embedding.chunkOverlap ?? 200;
-            const splitter = this.createSplitter(ext, chunkSize, chunkOverlap);
+            const oldTotalChunks = getChunkCount(existingPayload);
+            // 3. Chunk text
+            const chunkSize = this.config.chunkSize ?? 1000;
+            const chunkOverlap = this.config.chunkOverlap ?? 200;
+            const splitter = createSplitter(ext, chunkSize, chunkOverlap);
             const chunks = await splitter.splitText(extracted.text);
-            //
+            // 4. Embed all chunks
             const vectors = await this.embeddingProvider.embed(chunks);
-            //
+            // 5. Upsert all chunk points
             const points = chunks.map((chunk, i) => ({
                 id: pointId(filePath, i),
                 vector: vectors[i],
@@ -876,12 +1290,9 @@ class DocumentProcessor {
                 },
             }));
             await this.vectorStore.upsert(points);
-            //
+            // 6. Clean up orphaned chunks
             if (oldTotalChunks > chunks.length) {
-                const orphanIds =
-                for (let i = chunks.length; i < oldTotalChunks; i++) {
-                    orphanIds.push(pointId(filePath, i));
-                }
+                const orphanIds = chunkIds(filePath, oldTotalChunks).slice(chunks.length);
                 await this.vectorStore.delete(orphanIds);
             }
             this.logger.info({ filePath, chunks: chunks.length }, 'File processed successfully');
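
Orphan cleanup now derives IDs by slicing the full chunk-ID list instead of looping. When a file shrinks from 5 chunks to 3:

    const all = chunkIds('docs/a.md', 5); // IDs for chunks 0..4
    const orphanIds = all.slice(3);       // IDs for chunks 3 and 4 only
    // equivalent to the removed loop: for (i = 3; i < 5; i++) pointId('docs/a.md', i)
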
@@ -900,15 +1311,10 @@ class DocumentProcessor {
             // Get the existing payload to find total chunks
             const baseId = pointId(filePath, 0);
             const existingPayload = await this.vectorStore.getPayload(baseId);
-            const totalChunks =
-
-                : 1;
-            const ids = [];
-            for (let i = 0; i < totalChunks; i++) {
-                ids.push(pointId(filePath, i));
-            }
+            const totalChunks = getChunkCount(existingPayload);
+            const ids = chunkIds(filePath, totalChunks);
             await this.vectorStore.delete(ids);
-            await deleteMetadata(filePath, this.metadataDir);
+            await deleteMetadata(filePath, this.config.metadataDir);
             this.logger.info({ filePath }, 'File deleted from index');
         }
         catch (error) {
@@ -925,21 +1331,16 @@ class DocumentProcessor {
     async processMetadataUpdate(filePath, metadata) {
         try {
             // Read existing enrichment metadata and merge
-            const existing = (await readMetadata(filePath, this.metadataDir)) ?? {};
+            const existing = (await readMetadata(filePath, this.config.metadataDir)) ?? {};
             const merged = { ...existing, ...metadata };
-            await writeMetadata(filePath, this.metadataDir, merged);
+            await writeMetadata(filePath, this.config.metadataDir, merged);
             // Update all chunk payloads in Qdrant
             const baseId = pointId(filePath, 0);
             const existingPayload = await this.vectorStore.getPayload(baseId);
             if (!existingPayload)
                 return null;
-            const totalChunks =
-
-                : 1;
-            const ids = [];
-            for (let i = 0; i < totalChunks; i++) {
-                ids.push(pointId(filePath, i));
-            }
+            const totalChunks = getChunkCount(existingPayload);
+            const ids = chunkIds(filePath, totalChunks);
             await this.vectorStore.setPayload(ids, merged);
             this.logger.info({ filePath, chunks: totalChunks }, 'Metadata updated');
             return merged;
@@ -965,27 +1366,11 @@ class DocumentProcessor {
                 this.logger.debug({ filePath }, 'File not indexed, skipping');
                 return null;
             }
-
-            const
-            // Extract frontmatter/json for attribute building (lightweight)
-            const extracted = await extractText(filePath, ext);
-            // Build attributes + apply current rules
-            const attributes = buildAttributes(filePath, stats, extracted.frontmatter, extracted.json);
-            const inferred = applyRules(this.compiledRules, attributes);
-            // Read enrichment metadata (merge, enrichment wins)
-            const enrichment = await readMetadata(filePath, this.metadataDir);
-            const metadata = {
-                ...inferred,
-                ...(enrichment ?? {}),
-            };
+            // Build merged metadata (lightweight — no embedding)
+            const { metadata } = await buildMergedMetadata(filePath, this.compiledRules, this.config.metadataDir, this.config.maps, this.logger);
             // Update all chunk payloads
-            const totalChunks =
-
-                : 1;
-            const ids = [];
-            for (let i = 0; i < totalChunks; i++) {
-                ids.push(pointId(filePath, i));
-            }
+            const totalChunks = getChunkCount(existingPayload);
+            const ids = chunkIds(filePath, totalChunks);
             await this.vectorStore.setPayload(ids, metadata);
             this.logger.info({ filePath, chunks: totalChunks }, 'Rules re-applied');
             return metadata;
@@ -1004,23 +1389,12 @@ class DocumentProcessor {
         this.compiledRules = compiledRules;
         this.logger.info({ rules: compiledRules.length }, 'Inference rules updated');
     }
-    /**
-     * Create the appropriate text splitter for the given file extension.
-     *
-     * @param ext - File extension.
-     * @param chunkSize - Maximum chunk size in characters.
-     * @param chunkOverlap - Overlap between chunks in characters.
-     * @returns A text splitter instance.
-     */
-    createSplitter(ext, chunkSize, chunkOverlap) {
-        const lowerExt = ext.toLowerCase();
-        if (lowerExt === '.md' || lowerExt === '.markdown') {
-            return new textsplitters.MarkdownTextSplitter({ chunkSize, chunkOverlap });
-        }
-        return new textsplitters.RecursiveCharacterTextSplitter({ chunkSize, chunkOverlap });
-    }
 }
 
+/**
+ * @module queue
+ * Debounced, rate-limited, concurrent event queue for file watchers. Manages priority queuing and async callbacks. No direct I/O; orchestrates processing.
+ */
 /**
  * A debounced, rate-limited, concurrent event queue.
  */
@@ -1169,19 +1543,23 @@ class VectorStoreClient {
     client;
     collectionName;
     dims;
+    logger;
     /**
      * Create a new VectorStoreClient.
      *
      * @param config - Vector store configuration.
      * @param dimensions - The embedding vector dimensions.
+     * @param logger - Optional pino logger for retry warnings.
      */
-    constructor(config, dimensions) {
+    constructor(config, dimensions, logger) {
         this.client = new jsClientRest.QdrantClient({
             url: config.url,
             apiKey: config.apiKey,
+            checkCompatibility: false,
         });
         this.collectionName = config.collectionName;
         this.dims = dimensions;
+        this.logger = logger;
     }
     /**
      * Ensure the collection exists with correct dimensions and Cosine distance.

@@ -1208,13 +1586,42 @@ class VectorStoreClient {
     async upsert(points) {
         if (points.length === 0)
             return;
-        await
-
-
-
-
-
-
+        await retry(async (attempt) => {
+            if (attempt > 1) {
+                const msg = {
+                    attempt,
+                    operation: 'qdrant.upsert',
+                    points: points.length,
+                };
+                if (this.logger) {
+                    this.logger.warn(msg, 'Retrying Qdrant upsert');
+                }
+                else {
+                    console.warn(msg, 'Retrying Qdrant upsert');
+                }
+            }
+            await this.client.upsert(this.collectionName, {
+                wait: true,
+                points: points.map((p) => ({
+                    id: p.id,
+                    vector: p.vector,
+                    payload: p.payload,
+                })),
+            });
+        }, {
+            attempts: 5,
+            baseDelayMs: 500,
+            maxDelayMs: 10_000,
+            jitter: 0.2,
+            onRetry: ({ attempt, delayMs, error }) => {
+                const msg = { attempt, delayMs, operation: 'qdrant.upsert', error };
+                if (this.logger) {
+                    this.logger.warn(msg, 'Qdrant upsert failed; will retry');
+                }
+                else {
+                    console.warn(msg, 'Qdrant upsert failed; will retry');
+                }
+            },
         });
     }
     /**
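
The retry helper itself is defined elsewhere in the bundle and is not shown in this diff. A minimal sketch consistent with the options used at both call sites (attempts, baseDelayMs, maxDelayMs, jitter, onRetry), assuming exponential backoff with symmetric jitter:

// Sketch of a retry helper matching the call sites above; assumed, not the
// package's actual implementation. Backoff doubles per attempt, is capped at
// maxDelayMs, and is scattered by +/- jitter; onRetry fires before each delay.
async function retry(fn, { attempts = 5, baseDelayMs = 500, maxDelayMs = 10_000, jitter = 0.2, onRetry } = {}) {
    for (let attempt = 1; ; attempt++) {
        try {
            return await fn(attempt);
        }
        catch (error) {
            if (attempt >= attempts)
                throw error;
            const backoff = Math.min(baseDelayMs * 2 ** (attempt - 1), maxDelayMs);
            const delayMs = Math.round(backoff * (1 + jitter * (2 * Math.random() - 1)));
            onRetry?.({ attempt, delayMs, error });
            await new Promise((resolve) => setTimeout(resolve, delayMs));
        }
    }
}

Note that the wrapped callback also receives the attempt number, which the diffed code uses to log a "Retrying ..." line at the start of every attempt after the first.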

@@ -1225,9 +1632,38 @@ class VectorStoreClient {
     async delete(ids) {
         if (ids.length === 0)
             return;
-        await
-
-
+        await retry(async (attempt) => {
+            if (attempt > 1) {
+                const msg = {
+                    attempt,
+                    operation: 'qdrant.delete',
+                    ids: ids.length,
+                };
+                if (this.logger) {
+                    this.logger.warn(msg, 'Retrying Qdrant delete');
+                }
+                else {
+                    console.warn(msg, 'Retrying Qdrant delete');
+                }
+            }
+            await this.client.delete(this.collectionName, {
+                wait: true,
+                points: ids,
+            });
+        }, {
+            attempts: 5,
+            baseDelayMs: 500,
+            maxDelayMs: 10_000,
+            jitter: 0.2,
+            onRetry: ({ attempt, delayMs, error }) => {
+                const msg = { attempt, delayMs, operation: 'qdrant.delete', error };
+                if (this.logger) {
+                    this.logger.warn(msg, 'Qdrant delete failed; will retry');
+                }
+                else {
+                    console.warn(msg, 'Qdrant delete failed; will retry');
+                }
+            },
         });
     }
     /**

@@ -1327,6 +1763,10 @@ class VectorStoreClient {
     }
 }

+/**
+ * @module watcher
+ * Filesystem watcher wrapping chokidar. I/O: watches files/directories for add/change/unlink events, enqueues to processing queue.
+ */
 /**
  * Filesystem watcher that maps chokidar events to the processing queue.
  */
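
The new docblock summarizes the watcher's contract. A minimal sketch of that add/change/unlink to queue flow (FileSystemWatcher's internals sit outside this hunk; the queue.enqueue call shape is an assumption):

// Assumed wiring per the docblock, not the class's actual code. The glob and
// ignore patterns are placeholders; queue stands in for an EventQueue instance.
const fsWatcher = chokidar.watch(['docs/**/*.md'], { ignored: ['**/node_modules/**'] });
for (const event of ['add', 'change', 'unlink']) {
    fsWatcher.on(event, (filePath) => queue.enqueue({ event, filePath }));
}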

@@ -1424,16 +1864,22 @@ class JeevesWatcher {
         this.logger = logger;
         let embeddingProvider;
         try {
-            embeddingProvider = createEmbeddingProvider(this.config.embedding);
+            embeddingProvider = createEmbeddingProvider(this.config.embedding, logger);
         }
         catch (error) {
             logger.fatal({ error }, 'Failed to create embedding provider');
             throw error;
         }
-        const vectorStore = new VectorStoreClient(this.config.vectorStore, embeddingProvider.dimensions);
+        const vectorStore = new VectorStoreClient(this.config.vectorStore, embeddingProvider.dimensions, logger);
         await vectorStore.ensureCollection();
         const compiledRules = compileRules(this.config.inferenceRules ?? []);
-        const
+        const processorConfig = {
+            metadataDir: this.config.metadataDir ?? '.jeeves-metadata',
+            chunkSize: this.config.embedding.chunkSize,
+            chunkOverlap: this.config.embedding.chunkOverlap,
+            maps: this.config.maps,
+        };
+        const processor = new DocumentProcessor(processorConfig, embeddingProvider, vectorStore, compiledRules, logger);
         this.processor = processor;
         const queue = new EventQueue({
             debounceMs: this.config.watch.debounceMs ?? 2000,
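
The processorConfig fields map one-to-one onto the loaded config; an illustrative config fragment covering the options this hunk reads (values hypothetical):

// Illustrative only: chunkSize/chunkOverlap values are examples, and the two
// fallbacks noted in comments are the ?? defaults applied in the hunk above.
const config = {
    metadataDir: '.jeeves-metadata',   // fallback when unset
    embedding: { chunkSize: 1000, chunkOverlap: 200 },
    maps: undefined,                   // optional @karmaniverous/jsonmap mappings
    watch: { debounceMs: 2000 },       // fallback when unset
};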

@@ -1454,7 +1900,7 @@ class JeevesWatcher {
         this.server = server;
         await server.listen({
             host: this.config.api?.host ?? '127.0.0.1',
-            port: this.config.api?.port ??
+            port: this.config.api?.port ?? 3456,
         });
         watcher.start();
         this.startConfigWatch();

@@ -1470,12 +1916,17 @@ class JeevesWatcher {
         }
         if (this.queue) {
             const timeout = this.config.shutdownTimeoutMs ?? 10000;
-            await Promise.race([
-                this.queue.drain(),
+            const drained = await Promise.race([
+                this.queue.drain().then(() => true),
                 new Promise((resolve) => {
-                    setTimeout(
+                    setTimeout(() => {
+                        resolve(false);
+                    }, timeout);
                 }),
             ]);
+            if (!drained) {
+                this.logger?.warn({ timeoutMs: timeout }, 'Queue drain timeout hit, forcing shutdown');
+            }
         }
         if (this.server) {
             await this.server.close();
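
The generic form of the drain-or-timeout race above: resolve true if the work settles first, false if the timer fires first (the losing promise keeps running either way).

// Generic pattern extracted from the hunk above, for illustration.
const withTimeout = (promise, ms) => Promise.race([
    promise.then(() => true),
    new Promise((resolve) => {
        setTimeout(() => resolve(false), ms);
    }),
]);

As in the original, the timer is never cleared or unref'd, so a fast drain still leaves a pending timeout behind; calling .unref() on the timer would keep it from holding the event loop open during shutdown.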

@@ -1524,6 +1975,7 @@ class JeevesWatcher {
         const processor = this.processor;
         if (!logger || !processor || !this.configPath)
             return;
+        logger.info({ configPath: this.configPath }, 'Config change detected, reloading...');
         try {
             const newConfig = await loadConfig(this.configPath);
             this.config = newConfig;

@@ -1560,18 +2012,26 @@ exports.EventQueue = EventQueue;
 exports.FileSystemWatcher = FileSystemWatcher;
 exports.JeevesWatcher = JeevesWatcher;
 exports.VectorStoreClient = VectorStoreClient;
+exports.apiConfigSchema = apiConfigSchema;
 exports.applyRules = applyRules;
 exports.buildAttributes = buildAttributes;
 exports.compileRules = compileRules;
+exports.configWatchConfigSchema = configWatchConfigSchema;
 exports.contentHash = contentHash;
 exports.createApiServer = createApiServer;
 exports.createEmbeddingProvider = createEmbeddingProvider;
 exports.createLogger = createLogger;
 exports.deleteMetadata = deleteMetadata;
+exports.embeddingConfigSchema = embeddingConfigSchema;
 exports.extractText = extractText;
+exports.inferenceRuleSchema = inferenceRuleSchema;
+exports.jeevesWatcherConfigSchema = jeevesWatcherConfigSchema;
 exports.loadConfig = loadConfig;
+exports.loggingConfigSchema = loggingConfigSchema;
 exports.metadataPath = metadataPath;
 exports.pointId = pointId;
 exports.readMetadata = readMetadata;
 exports.startFromConfig = startFromConfig;
+exports.vectorStoreConfigSchema = vectorStoreConfigSchema;
+exports.watchConfigSchema = watchConfigSchema;
 exports.writeMetadata = writeMetadata;
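
0.2.0 newly exports the per-section config schemas. Given the new zod dependency at the top of the bundle, these are presumably zod schemas; a hypothetical consumer-side validation (rawConfig is a placeholder for a loaded config object, and the schema shapes themselves are not visible in this excerpt):

// Hypothetical usage of the newly exported config schemas, assuming zod.
const { jeevesWatcherConfigSchema } = require('@karmaniverous/jeeves-watcher');

const rawConfig = { /* loaded config object */ };
const result = jeevesWatcherConfigSchema.safeParse(rawConfig);
if (!result.success) {
    console.error(result.error.issues);
}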