@karmaniverous/jeeves-watcher 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- (function (exports, Fastify, node_crypto, promises, node_path, picomatch, chokidar, Ajv, cosmiconfig, googleGenai, pino, textsplitters, cheerio, yaml, mammoth, uuid, addFormats, jsClientRest) {
1
+ (function (exports, Fastify, promises, node_path, picomatch, radash, node_crypto, cosmiconfig, zod, jsonmap, googleGenai, pino, uuid, cheerio, yaml, mammoth, Ajv, addFormats, textsplitters, jsClientRest, chokidar) {
2
2
  'use strict';
3
3
 
4
4
  function _interopNamespaceDefault(e) {
@@ -20,73 +20,6 @@
20
20
 
21
21
  var cheerio__namespace = /*#__PURE__*/_interopNamespaceDefault(cheerio);
22
22
 
23
- /**
24
- * Normalise a file path for deterministic mapping: lowercase, forward slashes, strip leading drive letter colon.
25
- *
26
- * @param filePath - The original file path.
27
- * @returns The normalised path string.
28
- */
29
- function normalisePath$1(filePath) {
30
- return filePath
31
- .replace(/\\/g, '/')
32
- .replace(/^([A-Za-z]):/, (_m, letter) => letter.toLowerCase())
33
- .toLowerCase();
34
- }
35
- /**
36
- * Derive a deterministic `.meta.json` path for a given file.
37
- *
38
- * @param filePath - The watched file path.
39
- * @param metadataDir - The root metadata directory.
40
- * @returns The full path to the metadata file.
41
- */
42
- function metadataPath(filePath, metadataDir) {
43
- const normalised = normalisePath$1(filePath);
44
- const hash = node_crypto.createHash('sha256').update(normalised, 'utf8').digest('hex');
45
- return node_path.join(metadataDir, `${hash}.meta.json`);
46
- }
47
- /**
48
- * Read persisted metadata for a file.
49
- *
50
- * @param filePath - The watched file path.
51
- * @param metadataDir - The root metadata directory.
52
- * @returns The parsed metadata object, or `null` if not found.
53
- */
54
- async function readMetadata(filePath, metadataDir) {
55
- try {
56
- const raw = await promises.readFile(metadataPath(filePath, metadataDir), 'utf8');
57
- return JSON.parse(raw);
58
- }
59
- catch {
60
- return null;
61
- }
62
- }
63
- /**
64
- * Write metadata for a file.
65
- *
66
- * @param filePath - The watched file path.
67
- * @param metadataDir - The root metadata directory.
68
- * @param metadata - The metadata to persist.
69
- */
70
- async function writeMetadata(filePath, metadataDir, metadata) {
71
- const dest = metadataPath(filePath, metadataDir);
72
- await promises.mkdir(node_path.dirname(dest), { recursive: true });
73
- await promises.writeFile(dest, JSON.stringify(metadata, null, 2), 'utf8');
74
- }
75
- /**
76
- * Delete metadata for a file.
77
- *
78
- * @param filePath - The watched file path.
79
- * @param metadataDir - The root metadata directory.
80
- */
81
- async function deleteMetadata(filePath, metadataDir) {
82
- try {
83
- await promises.rm(metadataPath(filePath, metadataDir));
84
- }
85
- catch {
86
- // Ignore if file doesn't exist.
87
- }
88
- }
89
-
90
23
  /**
91
24
  * Best-effort base directory inference for a glob pattern.
92
25
  *
@@ -160,233 +93,536 @@
160
93
  }
161
94
 
162
95
  /**
163
- * Create the Fastify API server with all routes registered.
96
+ * @module processAllFiles
164
97
  *
165
- * The returned instance is not yet listening call `server.listen()` to start.
98
+ * Shared helper for processing all files matching configured globs.
99
+ */
100
+ /**
101
+ * Process all files from globs using the specified processor method.
166
102
  *
167
- * @param options - The server options.
168
- * @returns A configured Fastify instance.
103
+ * @param watchPaths - The glob patterns to match.
104
+ * @param ignoredPaths - The glob patterns to ignore.
105
+ * @param processor - The document processor instance.
106
+ * @param method - The processor method to call ('processFile' or 'processRulesUpdate').
107
+ * @returns The number of files processed.
169
108
  */
170
- function createApiServer(options) {
171
- const { processor, vectorStore, embeddingProvider, logger } = options;
172
- const app = Fastify({ logger: false });
173
- app.get('/status', () => ({
174
- status: 'ok',
175
- uptime: process.uptime(),
176
- }));
177
- app.post('/metadata', async (request, reply) => {
109
+ async function processAllFiles(watchPaths, ignoredPaths, processor, method) {
110
+ const files = await listFilesFromGlobs(watchPaths, ignoredPaths);
111
+ for (const file of files) {
112
+ // Sequential on purpose to avoid surprising load.
113
+ // Queue integration can come later.
114
+ await processor[method](file);
115
+ }
116
+ return files.length;
117
+ }
118
+
119
+ /**
120
+ * @module api/handlers/configReindex
121
+ * Fastify route handler for POST /config-reindex. Triggers an async reindex job scoped to rules or full processing.
122
+ */
123
+ /**
124
+ * Create handler for POST /config-reindex.
125
+ *
126
+ * @param deps - Route dependencies.
127
+ */
128
+ function createConfigReindexHandler(deps) {
129
+ return async (request, reply) => {
178
130
  try {
179
- const { path, metadata } = request.body;
180
- await processor.processMetadataUpdate(path, metadata);
181
- return { ok: true };
131
+ const scope = request.body.scope ?? 'rules';
132
+ // Return immediately and run async
133
+ void (async () => {
134
+ try {
135
+ if (scope === 'rules') {
136
+ const count = await processAllFiles(deps.config.watch.paths, deps.config.watch.ignored, deps.processor, 'processRulesUpdate');
137
+ deps.logger.info({ scope, filesProcessed: count }, 'Config reindex (rules) completed');
138
+ }
139
+ else {
140
+ const count = await processAllFiles(deps.config.watch.paths, deps.config.watch.ignored, deps.processor, 'processFile');
141
+ deps.logger.info({ scope, filesProcessed: count }, 'Config reindex (full) completed');
142
+ }
143
+ }
144
+ catch (error) {
145
+ deps.logger.error({ error, scope }, 'Config reindex failed');
146
+ }
147
+ })();
148
+ return await reply.status(200).send({ status: 'started', scope });
182
149
  }
183
150
  catch (error) {
184
- logger.error({ error }, 'Metadata update failed');
185
- return reply.status(500).send({ error: 'Internal server error' });
151
+ deps.logger.error({ error }, 'Config reindex request failed');
152
+ return await reply.status(500).send({ error: 'Internal server error' });
186
153
  }
187
- });
188
- app.post('/search', async (request, reply) => {
154
+ };
155
+ }
156
+
157
+ /**
158
+ * @module api/handlers/metadata
159
+ * Fastify route handler for POST /metadata. Performs enrichment metadata updates via the document processor.
160
+ */
161
+ /**
162
+ * Create handler for POST /metadata.
163
+ *
164
+ * @param deps - Route dependencies.
165
+ */
166
+ function createMetadataHandler(deps) {
167
+ return async (request, reply) => {
189
168
  try {
190
- const { query, limit = 10 } = request.body;
191
- const vectors = await embeddingProvider.embed([query]);
192
- const results = await vectorStore.search(vectors[0], limit);
193
- return results;
169
+ const { path, metadata } = request.body;
170
+ await deps.processor.processMetadataUpdate(path, metadata);
171
+ return { ok: true };
194
172
  }
195
173
  catch (error) {
196
- logger.error({ error }, 'Search failed');
174
+ deps.logger.error({ error }, 'Metadata update failed');
197
175
  return reply.status(500).send({ error: 'Internal server error' });
198
176
  }
199
- });
200
- app.post('/reindex', async (_request, reply) => {
201
- try {
202
- const files = await listFilesFromGlobs(options.config.watch.paths, options.config.watch.ignored);
203
- for (const file of files) {
204
- // Sequential on purpose to avoid surprising load.
205
- // Queue integration can come later.
206
- await processor.processFile(file);
207
- }
208
- return await reply
209
- .status(200)
210
- .send({ ok: true, filesIndexed: files.length });
211
- }
212
- catch (error) {
213
- logger.error({ error }, 'Reindex failed');
214
- return await reply.status(500).send({ error: 'Internal server error' });
215
- }
216
- });
217
- app.post('/rebuild-metadata', async (_request, reply) => {
177
+ };
178
+ }
179
+
180
+ /**
181
+ * @module util/normalizePath
182
+ * Normalizes file paths for deterministic mapping: lowercase, forward slashes, optional drive letter stripping.
183
+ */
184
+ /**
185
+ * Normalize a file path: lowercase, forward slashes, optionally strip drive letter colon.
186
+ *
187
+ * @param filePath - The original file path.
188
+ * @param stripDriveLetter - Whether to strip the colon from a leading drive letter (e.g. `C:` → `c`).
189
+ * @returns The normalized path string.
190
+ */
191
+ function normalizePath(filePath, stripDriveLetter = false) {
192
+ let result = filePath.replace(/\\/g, '/').toLowerCase();
193
+ if (stripDriveLetter) {
194
+ result = result.replace(/^([a-z]):/, (_m, letter) => letter);
195
+ }
196
+ return result;
197
+ }
198
+
199
+ /**
200
+ * @module metadata/metadata
201
+ * Persists file metadata as .meta.json. I/O: reads/writes/deletes metadata files under metadataDir. Path mapping via SHA-256 hash.
202
+ */
203
+ /**
204
+ * Derive a deterministic `.meta.json` path for a given file.
205
+ *
206
+ * @param filePath - The watched file path.
207
+ * @param metadataDir - The root metadata directory.
208
+ * @returns The full path to the metadata file.
209
+ */
210
+ function metadataPath(filePath, metadataDir) {
211
+ const normalised = normalizePath(filePath, true);
212
+ const hash = node_crypto.createHash('sha256').update(normalised, 'utf8').digest('hex');
213
+ return node_path.join(metadataDir, `${hash}.meta.json`);
214
+ }
215
+ /**
216
+ * Read persisted metadata for a file.
217
+ *
218
+ * @param filePath - The watched file path.
219
+ * @param metadataDir - The root metadata directory.
220
+ * @returns The parsed metadata object, or `null` if not found.
221
+ */
222
+ async function readMetadata(filePath, metadataDir) {
223
+ try {
224
+ const raw = await promises.readFile(metadataPath(filePath, metadataDir), 'utf8');
225
+ return JSON.parse(raw);
226
+ }
227
+ catch {
228
+ return null;
229
+ }
230
+ }
231
+ /**
232
+ * Write metadata for a file.
233
+ *
234
+ * @param filePath - The watched file path.
235
+ * @param metadataDir - The root metadata directory.
236
+ * @param metadata - The metadata to persist.
237
+ */
238
+ async function writeMetadata(filePath, metadataDir, metadata) {
239
+ const dest = metadataPath(filePath, metadataDir);
240
+ await promises.mkdir(node_path.dirname(dest), { recursive: true });
241
+ await promises.writeFile(dest, JSON.stringify(metadata, null, 2), 'utf8');
242
+ }
243
+ /**
244
+ * Delete metadata for a file.
245
+ *
246
+ * @param filePath - The watched file path.
247
+ * @param metadataDir - The root metadata directory.
248
+ */
249
+ async function deleteMetadata(filePath, metadataDir) {
250
+ try {
251
+ await promises.rm(metadataPath(filePath, metadataDir));
252
+ }
253
+ catch {
254
+ // Ignore if file doesn't exist.
255
+ }
256
+ }
257
+
258
+ /**
259
+ * @module metadata/constants
260
+ * Shared constants for metadata key classification. System keys are injected by the indexing pipeline, not user-provided.
261
+ */
262
+ /** Keys managed by the indexing pipeline (not user enrichment). */
263
+ const SYSTEM_METADATA_KEYS = [
264
+ 'file_path',
265
+ 'chunk_index',
266
+ 'total_chunks',
267
+ 'content_hash',
268
+ 'chunk_text',
269
+ ];
270
+
271
+ /**
272
+ * @module api/handlers/rebuildMetadata
273
+ * Fastify route handler for POST /rebuild-metadata. Recreates enrichment metadata files from vector store payloads.
274
+ */
275
+ /**
276
+ * Create handler for POST /rebuild-metadata.
277
+ *
278
+ * @param deps - Route dependencies.
279
+ */
280
+ function createRebuildMetadataHandler(deps) {
281
+ return async (_request, reply) => {
218
282
  try {
219
- const metadataDir = options.config.metadataDir ?? '.jeeves-metadata';
220
- for await (const point of vectorStore.scroll()) {
283
+ const metadataDir = deps.config.metadataDir ?? '.jeeves-metadata';
284
+ const systemKeys = [...SYSTEM_METADATA_KEYS];
285
+ for await (const point of deps.vectorStore.scroll()) {
221
286
  const payload = point.payload;
222
287
  const filePath = payload['file_path'];
223
288
  if (typeof filePath !== 'string' || filePath.length === 0)
224
289
  continue;
225
290
  // Persist only enrichment-ish fields, not chunking/index fields.
226
- const rest = { ...payload };
227
- delete rest.file_path;
228
- delete rest.chunk_index;
229
- delete rest.total_chunks;
230
- delete rest.content_hash;
231
- delete rest.chunk_text;
232
- await writeMetadata(filePath, metadataDir, rest);
291
+ const enrichment = radash.omit(payload, systemKeys);
292
+ await writeMetadata(filePath, metadataDir, enrichment);
233
293
  }
234
294
  return await reply.status(200).send({ ok: true });
235
295
  }
236
296
  catch (error) {
237
- logger.error({ error }, 'Rebuild metadata failed');
297
+ deps.logger.error({ error }, 'Rebuild metadata failed');
238
298
  return await reply.status(500).send({ error: 'Internal server error' });
239
299
  }
240
- });
241
- app.post('/config-reindex', async (request, reply) => {
300
+ };
301
+ }
302
+
303
+ /**
304
+ * @module api/handlers/reindex
305
+ * Fastify route handler for POST /reindex. Reprocesses all watched files through the processor.
306
+ */
307
+ /**
308
+ * Create handler for POST /reindex.
309
+ *
310
+ * @param deps - Route dependencies.
311
+ */
312
+ function createReindexHandler(deps) {
313
+ return async (_request, reply) => {
242
314
  try {
243
- const scope = request.body.scope ?? 'rules';
244
- // Return immediately and run async
245
- void (async () => {
246
- try {
247
- if (scope === 'rules') {
248
- // Re-apply inference rules to all files, update Qdrant payloads (no re-embedding)
249
- const files = await listFilesFromGlobs(options.config.watch.paths, options.config.watch.ignored);
250
- for (const file of files) {
251
- // Use the new processRulesUpdate method
252
- await processor.processRulesUpdate(file);
253
- }
254
- logger.info({ scope, filesProcessed: files.length }, 'Config reindex (rules) completed');
255
- }
256
- else {
257
- // Full reindex: re-extract, re-embed, re-upsert
258
- const files = await listFilesFromGlobs(options.config.watch.paths, options.config.watch.ignored);
259
- for (const file of files) {
260
- await processor.processFile(file);
261
- }
262
- logger.info({ scope, filesProcessed: files.length }, 'Config reindex (full) completed');
263
- }
264
- }
265
- catch (error) {
266
- logger.error({ error, scope }, 'Config reindex failed');
267
- }
268
- })();
269
- return await reply.status(200).send({ status: 'started', scope });
315
+ const count = await processAllFiles(deps.config.watch.paths, deps.config.watch.ignored, deps.processor, 'processFile');
316
+ return await reply.status(200).send({ ok: true, filesIndexed: count });
270
317
  }
271
318
  catch (error) {
272
- logger.error({ error }, 'Config reindex request failed');
319
+ deps.logger.error({ error }, 'Reindex failed');
273
320
  return await reply.status(500).send({ error: 'Internal server error' });
274
321
  }
322
+ };
323
+ }
324
+
325
+ /**
326
+ * @module api/handlers/search
327
+ * Fastify route handler for POST /search. Embeds a query and performs vector store similarity search.
328
+ */
329
+ /**
330
+ * Create handler for POST /search.
331
+ *
332
+ * @param deps - Route dependencies.
333
+ */
334
+ function createSearchHandler(deps) {
335
+ return async (request, reply) => {
336
+ try {
337
+ const { query, limit = 10 } = request.body;
338
+ const vectors = await deps.embeddingProvider.embed([query]);
339
+ const results = await deps.vectorStore.search(vectors[0], limit);
340
+ return results;
341
+ }
342
+ catch (error) {
343
+ deps.logger.error({ error }, 'Search failed');
344
+ return reply.status(500).send({ error: 'Internal server error' });
345
+ }
346
+ };
347
+ }
348
+
349
+ /**
350
+ * @module api/handlers/status
351
+ * Fastify route handler for GET /status. Pure handler: returns process uptime and health.
352
+ */
353
+ /**
354
+ * Create handler for GET /status.
355
+ */
356
+ function createStatusHandler() {
357
+ return () => ({
358
+ status: 'ok',
359
+ uptime: process.uptime(),
275
360
  });
361
+ }
362
+
363
+ /**
364
+ * Create the Fastify API server with all routes registered.
365
+ *
366
+ * The returned instance is not yet listening — call `server.listen()` to start.
367
+ *
368
+ * @param options - The server options.
369
+ * @returns A configured Fastify instance.
370
+ */
371
+ function createApiServer(options) {
372
+ const { processor, vectorStore, embeddingProvider, logger, config } = options;
373
+ const app = Fastify({ logger: false });
374
+ app.get('/status', createStatusHandler());
375
+ app.post('/metadata', createMetadataHandler({ processor, logger }));
376
+ app.post('/search', createSearchHandler({ embeddingProvider, vectorStore, logger }));
377
+ app.post('/reindex', createReindexHandler({ config, processor, logger }));
378
+ app.post('/rebuild-metadata', createRebuildMetadataHandler({ config, vectorStore, logger }));
379
+ app.post('/config-reindex', createConfigReindexHandler({ config, processor, logger }));
276
380
  return app;
277
381
  }
278
382
 
279
- const MODULE_NAME = 'jeeves-watcher';
280
- /** JSON Schema for validating jeeves-watcher configuration. */
281
- const configSchema = {
282
- type: 'object',
283
- required: ['watch', 'embedding', 'vectorStore'],
284
- properties: {
285
- watch: {
286
- type: 'object',
287
- required: ['paths'],
288
- properties: {
289
- paths: { type: 'array', items: { type: 'string' }, minItems: 1 },
290
- ignored: { type: 'array', items: { type: 'string' } },
291
- pollIntervalMs: { type: 'number' },
292
- usePolling: { type: 'boolean' },
293
- debounceMs: { type: 'number' },
294
- stabilityThresholdMs: { type: 'number' },
295
- },
296
- additionalProperties: false,
297
- },
298
- configWatch: {
299
- type: 'object',
300
- properties: {
301
- enabled: { type: 'boolean' },
302
- debounceMs: { type: 'number' },
303
- },
304
- additionalProperties: false,
305
- },
306
- embedding: {
307
- type: 'object',
308
- required: ['provider', 'model'],
309
- properties: {
310
- provider: { type: 'string' },
311
- model: { type: 'string' },
312
- chunkSize: { type: 'number' },
313
- chunkOverlap: { type: 'number' },
314
- dimensions: { type: 'number' },
315
- apiKey: { type: 'string' },
316
- rateLimitPerMinute: { type: 'number' },
317
- concurrency: { type: 'number' },
318
- },
319
- additionalProperties: false,
320
- },
321
- vectorStore: {
322
- type: 'object',
323
- required: ['url', 'collectionName'],
324
- properties: {
325
- url: { type: 'string' },
326
- collectionName: { type: 'string' },
327
- apiKey: { type: 'string' },
328
- },
329
- additionalProperties: false,
330
- },
331
- metadataDir: { type: 'string' },
332
- api: {
333
- type: 'object',
334
- properties: {
335
- host: { type: 'string' },
336
- port: { type: 'number' },
337
- },
338
- additionalProperties: false,
339
- },
340
- extractors: { type: 'object' },
341
- inferenceRules: {
342
- type: 'array',
343
- items: {
344
- type: 'object',
345
- required: ['match', 'set'],
346
- properties: {
347
- match: { type: 'object' },
348
- set: { type: 'object' },
349
- },
350
- additionalProperties: false,
351
- },
352
- },
353
- logging: {
354
- type: 'object',
355
- properties: {
356
- level: { type: 'string' },
357
- file: { type: 'string' },
358
- },
359
- additionalProperties: false,
360
- },
361
- shutdownTimeoutMs: { type: 'number' },
362
- },
363
- additionalProperties: false,
364
- };
365
- const ajv = new Ajv({ allErrors: true });
366
- const validate = ajv.compile(configSchema);
367
- /** Default values for optional configuration fields. */
368
- const DEFAULTS = {
369
- configWatch: { enabled: true, debounceMs: 1000 },
383
+ /**
384
+ * @module config/defaults
385
+ * Default configuration values for jeeves-watcher. Pure data export, no I/O or side effects.
386
+ */
387
+ /** Default root-level config values. */
388
+ const ROOT_DEFAULTS = {
370
389
  metadataDir: '.jeeves-watcher',
371
- api: { host: '127.0.0.1', port: 3100 },
372
- logging: { level: 'info' },
373
390
  shutdownTimeoutMs: 10000,
374
391
  };
375
- /** Default values for watch configuration. */
392
+ /** Default configWatch values. */
393
+ const CONFIG_WATCH_DEFAULTS = {
394
+ enabled: true,
395
+ debounceMs: 1000,
396
+ };
397
+ /** Default API values. */
398
+ const API_DEFAULTS = {
399
+ host: '127.0.0.1',
400
+ port: 3456,
401
+ };
402
+ /** Default logging values. */
403
+ const LOGGING_DEFAULTS = {
404
+ level: 'info',
405
+ };
406
+ /** Default watch configuration. */
376
407
  const WATCH_DEFAULTS = {
377
408
  debounceMs: 300,
378
409
  stabilityThresholdMs: 500,
379
410
  usePolling: false,
380
411
  pollIntervalMs: 1000,
381
412
  };
382
- /** Default values for embedding configuration. */
413
+ /** Default embedding configuration. */
383
414
  const EMBEDDING_DEFAULTS = {
384
415
  chunkSize: 1000,
385
416
  chunkOverlap: 200,
386
- dimensions: 768,
417
+ dimensions: 3072,
387
418
  rateLimitPerMinute: 300,
388
419
  concurrency: 5,
389
420
  };
421
+
422
+ /**
423
+ * Watch configuration for file system monitoring.
424
+ */
425
+ const watchConfigSchema = zod.z.object({
426
+ /** Glob patterns to watch. */
427
+ paths: zod.z
428
+ .array(zod.z.string())
429
+ .min(1)
430
+ .describe('Glob patterns for files to watch (e.g., "**/*.md"). At least one required.'),
431
+ /** Glob patterns to ignore. */
432
+ ignored: zod.z
433
+ .array(zod.z.string())
434
+ .optional()
435
+ .describe('Glob patterns to exclude from watching (e.g., "**/node_modules/**").'),
436
+ /** Polling interval in milliseconds. */
437
+ pollIntervalMs: zod.z
438
+ .number()
439
+ .optional()
440
+ .describe('Polling interval in milliseconds when usePolling is enabled.'),
441
+ /** Whether to use polling instead of native watchers. */
442
+ usePolling: zod.z
443
+ .boolean()
444
+ .optional()
445
+ .describe('Use polling instead of native file system events (for network drives).'),
446
+ /** Debounce delay in milliseconds for file change events. */
447
+ debounceMs: zod.z
448
+ .number()
449
+ .optional()
450
+ .describe('Debounce delay in milliseconds for file change events.'),
451
+ /** Time in milliseconds a file must be stable before processing. */
452
+ stabilityThresholdMs: zod.z
453
+ .number()
454
+ .optional()
455
+ .describe('Time in milliseconds a file must remain unchanged before processing.'),
456
+ });
457
+ /**
458
+ * Configuration watch settings.
459
+ */
460
+ const configWatchConfigSchema = zod.z.object({
461
+ /** Whether config file watching is enabled. */
462
+ enabled: zod.z
463
+ .boolean()
464
+ .optional()
465
+ .describe('Enable automatic reloading when config file changes.'),
466
+ /** Debounce delay in milliseconds for config change events. */
467
+ debounceMs: zod.z
468
+ .number()
469
+ .optional()
470
+ .describe('Debounce delay in milliseconds for config file change detection.'),
471
+ });
472
+ /**
473
+ * Embedding model configuration.
474
+ */
475
+ const embeddingConfigSchema = zod.z.object({
476
+ /** The embedding model provider. */
477
+ provider: zod.z
478
+ .string()
479
+ .default('gemini')
480
+ .describe('Embedding provider name (e.g., "gemini", "openai").'),
481
+ /** The embedding model name. */
482
+ model: zod.z
483
+ .string()
484
+ .default('gemini-embedding-001')
485
+ .describe('Embedding model identifier (e.g., "gemini-embedding-001", "text-embedding-3-small").'),
486
+ /** Maximum tokens per chunk for splitting. */
487
+ chunkSize: zod.z
488
+ .number()
489
+ .optional()
490
+ .describe('Maximum chunk size in characters for text splitting.'),
491
+ /** Overlap between chunks in tokens. */
492
+ chunkOverlap: zod.z
493
+ .number()
494
+ .optional()
495
+ .describe('Character overlap between consecutive chunks.'),
496
+ /** Embedding vector dimensions. */
497
+ dimensions: zod.z
498
+ .number()
499
+ .optional()
500
+ .describe('Embedding vector dimensions (must match model output).'),
501
+ /** API key for the embedding provider. */
502
+ apiKey: zod.z
503
+ .string()
504
+ .optional()
505
+ .describe('API key for embedding provider (supports ${ENV_VAR} substitution).'),
506
+ /** Maximum embedding requests per minute. */
507
+ rateLimitPerMinute: zod.z
508
+ .number()
509
+ .optional()
510
+ .describe('Maximum embedding API requests per minute (rate limiting).'),
511
+ /** Maximum concurrent embedding requests. */
512
+ concurrency: zod.z
513
+ .number()
514
+ .optional()
515
+ .describe('Maximum concurrent embedding requests.'),
516
+ });
517
+ /**
518
+ * Vector store configuration for Qdrant.
519
+ */
520
+ const vectorStoreConfigSchema = zod.z.object({
521
+ /** Qdrant server URL. */
522
+ url: zod.z
523
+ .string()
524
+ .describe('Qdrant server URL (e.g., "http://localhost:6333").'),
525
+ /** Qdrant collection name. */
526
+ collectionName: zod.z
527
+ .string()
528
+ .describe('Qdrant collection name for vector storage.'),
529
+ /** Qdrant API key. */
530
+ apiKey: zod.z
531
+ .string()
532
+ .optional()
533
+ .describe('Qdrant API key for authentication (supports ${ENV_VAR} substitution).'),
534
+ });
535
+ /**
536
+ * API server configuration.
537
+ */
538
+ const apiConfigSchema = zod.z.object({
539
+ /** Host to bind to. */
540
+ host: zod.z
541
+ .string()
542
+ .optional()
543
+ .describe('Host address for API server (e.g., "127.0.0.1", "0.0.0.0").'),
544
+ /** Port to listen on. */
545
+ port: zod.z.number().optional().describe('Port for API server (e.g., 3456).'),
546
+ });
547
+ /**
548
+ * Logging configuration.
549
+ */
550
+ const loggingConfigSchema = zod.z.object({
551
+ /** Log level. */
552
+ level: zod.z
553
+ .string()
554
+ .optional()
555
+ .describe('Logging level (trace, debug, info, warn, error, fatal).'),
556
+ /** Log file path. */
557
+ file: zod.z
558
+ .string()
559
+ .optional()
560
+ .describe('Path to log file (logs to stdout if omitted).'),
561
+ });
562
+ /**
563
+ * An inference rule that enriches document metadata.
564
+ */
565
+ const inferenceRuleSchema = zod.z.object({
566
+ /** JSON Schema object to match against document metadata. */
567
+ match: zod.z
568
+ .record(zod.z.string(), zod.z.unknown())
569
+ .describe('JSON Schema object to match against file attributes.'),
570
+ /** Metadata fields to set when the rule matches. */
571
+ set: zod.z
572
+ .record(zod.z.string(), zod.z.unknown())
573
+ .describe('Metadata fields to set when match succeeds.'),
574
+ /** JsonMap transformation (inline or reference to named map). */
575
+ map: zod.z
576
+ .union([jsonmap.jsonMapMapSchema, zod.z.string()])
577
+ .optional()
578
+ .describe('JsonMap transformation (inline definition or named map reference).'),
579
+ });
580
+ /**
581
+ * Top-level configuration for jeeves-watcher.
582
+ */
583
+ const jeevesWatcherConfigSchema = zod.z.object({
584
+ /** File system watch configuration. */
585
+ watch: watchConfigSchema.describe('File system watch configuration.'),
586
+ /** Configuration file watch settings. */
587
+ configWatch: configWatchConfigSchema
588
+ .optional()
589
+ .describe('Configuration file watch settings.'),
590
+ /** Embedding model configuration. */
591
+ embedding: embeddingConfigSchema.describe('Embedding model configuration.'),
592
+ /** Vector store configuration. */
593
+ vectorStore: vectorStoreConfigSchema.describe('Qdrant vector store configuration.'),
594
+ /** Directory for persisted metadata. */
595
+ metadataDir: zod.z
596
+ .string()
597
+ .optional()
598
+ .describe('Directory for persisted metadata sidecar files.'),
599
+ /** API server configuration. */
600
+ api: apiConfigSchema.optional().describe('API server configuration.'),
601
+ /** Extractor configurations keyed by name. */
602
+ extractors: zod.z
603
+ .record(zod.z.string(), zod.z.unknown())
604
+ .optional()
605
+ .describe('Extractor configurations keyed by name.'),
606
+ /** Rules for inferring metadata from document properties. */
607
+ inferenceRules: zod.z
608
+ .array(inferenceRuleSchema)
609
+ .optional()
610
+ .describe('Rules for inferring metadata from file attributes.'),
611
+ /** Reusable named JsonMap transformations. */
612
+ maps: zod.z
613
+ .record(zod.z.string(), jsonmap.jsonMapMapSchema)
614
+ .optional()
615
+ .describe('Reusable named JsonMap transformations.'),
616
+ /** Logging configuration. */
617
+ logging: loggingConfigSchema.optional().describe('Logging configuration.'),
618
+ /** Timeout in milliseconds for graceful shutdown. */
619
+ shutdownTimeoutMs: zod.z
620
+ .number()
621
+ .optional()
622
+ .describe('Timeout in milliseconds for graceful shutdown.'),
623
+ });
624
+
625
+ const MODULE_NAME = 'jeeves-watcher';
390
626
  /**
391
627
  * Merge sensible defaults into a loaded configuration.
392
628
  *
@@ -395,13 +631,13 @@
395
631
  */
396
632
  function applyDefaults(raw) {
397
633
  return {
398
- ...DEFAULTS,
634
+ ...ROOT_DEFAULTS,
399
635
  ...raw,
400
636
  watch: { ...WATCH_DEFAULTS, ...raw.watch },
401
- configWatch: { ...DEFAULTS.configWatch, ...raw.configWatch },
637
+ configWatch: { ...CONFIG_WATCH_DEFAULTS, ...raw.configWatch },
402
638
  embedding: { ...EMBEDDING_DEFAULTS, ...raw.embedding },
403
- api: { ...DEFAULTS.api, ...raw.api },
404
- logging: { ...DEFAULTS.logging, ...raw.logging },
639
+ api: { ...API_DEFAULTS, ...raw.api },
640
+ logging: { ...LOGGING_DEFAULTS, ...raw.logging },
405
641
  };
406
642
  }
407
643
  /**
@@ -419,21 +655,114 @@
419
655
  if (!result || result.isEmpty) {
420
656
  throw new Error('No jeeves-watcher configuration found. Create a .jeeves-watcherrc or jeeves-watcher.config.{js,ts,json,yaml} file.');
421
657
  }
422
- const raw = result.config;
423
- if (!validate(raw)) {
424
- const errors = validate.errors
425
- ?.map((e) => {
426
- const instancePath = 'instancePath' in e
427
- ? e.instancePath
428
- : undefined;
429
- return `${instancePath ?? '/'}: ${e.message ?? 'unknown error'}`;
430
- })
431
- .join('; ');
432
- throw new Error(`Invalid jeeves-watcher configuration: ${errors ?? 'unknown error'}`);
658
+ try {
659
+ const validated = jeevesWatcherConfigSchema.parse(result.config);
660
+ return applyDefaults(validated);
661
+ }
662
+ catch (error) {
663
+ if (error instanceof zod.ZodError) {
664
+ const errors = error.issues
665
+ .map((issue) => `${issue.path.join('.')}: ${issue.message}`)
666
+ .join('; ');
667
+ throw new Error(`Invalid jeeves-watcher configuration: ${errors}`);
668
+ }
669
+ throw error;
433
670
  }
434
- return applyDefaults(raw);
435
671
  }
436
672
 
673
+ /**
674
+ * @module util/logger
675
+ * Logger fallback helper. Provides a unified warn interface that delegates to pino or console.
676
+ */
677
+ /**
678
+ * Return a minimal logger that delegates to pino if available, otherwise console.
679
+ *
680
+ * @param logger - Optional pino logger instance.
681
+ * @returns A minimal logger.
682
+ */
683
+ function getLogger(logger) {
684
+ if (logger)
685
+ return logger;
686
+ return {
687
+ warn(obj, msg) {
688
+ if (msg) {
689
+ console.warn(obj, msg);
690
+ }
691
+ else {
692
+ console.warn(obj);
693
+ }
694
+ },
695
+ };
696
+ }
697
+
698
+ /**
699
+ * @module util/retry
700
+ * Small async retry helper with exponential backoff. Side effects: sleeps between attempts; can invoke onRetry callback for logging.
701
+ */
702
+ function sleep(ms, signal) {
703
+ if (ms <= 0)
704
+ return Promise.resolve();
705
+ return new Promise((resolve, reject) => {
706
+ const timer = setTimeout(() => {
707
+ cleanup();
708
+ resolve();
709
+ }, ms);
710
+ const onAbort = () => {
711
+ cleanup();
712
+ reject(new Error('Retry sleep aborted'));
713
+ };
714
+ const cleanup = () => {
715
+ clearTimeout(timer);
716
+ if (signal)
717
+ signal.removeEventListener('abort', onAbort);
718
+ };
719
+ if (signal) {
720
+ if (signal.aborted) {
721
+ onAbort();
722
+ return;
723
+ }
724
+ signal.addEventListener('abort', onAbort, { once: true });
725
+ }
726
+ });
727
+ }
728
+ function computeDelayMs(attempt, baseDelayMs, maxDelayMs, jitter = 0) {
729
+ const exp = Math.max(0, attempt - 1);
730
+ const raw = Math.min(maxDelayMs, baseDelayMs * 2 ** exp);
731
+ const factor = jitter > 0 ? 1 + Math.random() * jitter : 1;
732
+ return Math.round(raw * factor);
733
+ }
734
+ /**
735
+ * Retry an async operation using exponential backoff.
736
+ *
737
+ * @param fn - Operation to execute.
738
+ * @param options - Retry policy.
739
+ * @returns The operation result.
740
+ */
741
+ async function retry(fn, options) {
742
+ const attempts = Math.max(1, options.attempts);
743
+ let lastError;
744
+ for (let attempt = 1; attempt <= attempts; attempt++) {
745
+ try {
746
+ return await fn(attempt);
747
+ }
748
+ catch (error) {
749
+ lastError = error;
750
+ const isLast = attempt >= attempts;
751
+ if (isLast)
752
+ break;
753
+ const delayMs = computeDelayMs(attempt, options.baseDelayMs, options.maxDelayMs, options.jitter);
754
+ options.onRetry?.({ attempt, attempts, delayMs, error });
755
+ await sleep(delayMs, options.signal);
756
+ }
757
+ }
758
+ throw lastError;
759
+ }
760
+
761
+ /**
762
+ * @module embedding
763
+ *
764
+ * Embedding provider abstractions and registry-backed factory.
765
+ */
437
766
  /**
438
767
  * Create a mock embedding provider that generates deterministic vectors from content hashes.
439
768
  *
@@ -461,14 +790,16 @@
461
790
  * Create a Gemini embedding provider using the Google Generative AI SDK.
462
791
  *
463
792
  * @param config - The embedding configuration.
793
+ * @param logger - Optional pino logger for retry warnings.
464
794
  * @returns A Gemini {@link EmbeddingProvider}.
465
795
  * @throws If the API key is missing.
466
796
  */
467
- function createGeminiProvider(config) {
797
+ function createGeminiProvider(config, logger) {
468
798
  if (!config.apiKey) {
469
799
  throw new Error('Gemini embedding provider requires config.embedding.apiKey');
470
800
  }
471
801
  const dimensions = config.dimensions ?? 3072;
802
+ const log = getLogger(logger);
472
803
  const embedder = new googleGenai.GoogleGenerativeAIEmbeddings({
473
804
  apiKey: config.apiKey,
474
805
  model: config.model,
@@ -476,8 +807,27 @@
476
807
  return {
477
808
  dimensions,
478
809
  async embed(texts) {
479
- // embedDocuments returns vectors for multiple texts
480
- const vectors = await embedder.embedDocuments(texts);
810
+ const vectors = await retry(async (attempt) => {
811
+ if (attempt > 1) {
812
+ log.warn({ attempt, provider: 'gemini', model: config.model }, 'Retrying embedding request');
813
+ }
814
+ // embedDocuments returns vectors for multiple texts
815
+ return embedder.embedDocuments(texts);
816
+ }, {
817
+ attempts: 5,
818
+ baseDelayMs: 500,
819
+ maxDelayMs: 10_000,
820
+ jitter: 0.2,
821
+ onRetry: ({ attempt, delayMs, error }) => {
822
+ log.warn({
823
+ attempt,
824
+ delayMs,
825
+ provider: 'gemini',
826
+ model: config.model,
827
+ error,
828
+ }, 'Embedding call failed; will retry');
829
+ },
830
+ });
481
831
  // Validate dimensions
482
832
  for (const vector of vectors) {
483
833
  if (vector.length !== dimensions) {
@@ -488,25 +838,36 @@
488
838
  },
489
839
  };
490
840
  }
841
+ function createMockFromConfig(config) {
842
+ const dimensions = config.dimensions ?? 768;
843
+ return createMockProvider(dimensions);
844
+ }
845
+ const embeddingProviderRegistry = new Map([
846
+ ['mock', createMockFromConfig],
847
+ ['gemini', createGeminiProvider],
848
+ ]);
491
849
  /**
492
850
  * Create an embedding provider based on the given configuration.
493
851
  *
852
+ * Each provider is responsible for its own default dimensions.
853
+ *
494
854
  * @param config - The embedding configuration.
855
+ * @param logger - Optional pino logger for retry warnings.
495
856
  * @returns An {@link EmbeddingProvider} instance.
496
857
  * @throws If the configured provider is not supported.
497
858
  */
498
- function createEmbeddingProvider(config) {
499
- const dimensions = config.dimensions ?? 768;
500
- switch (config.provider) {
501
- case 'mock':
502
- return createMockProvider(dimensions);
503
- case 'gemini':
504
- return createGeminiProvider(config);
505
- default:
506
- throw new Error(`Unsupported embedding provider: ${config.provider}`);
859
+ function createEmbeddingProvider(config, logger) {
860
+ const factory = embeddingProviderRegistry.get(config.provider);
861
+ if (!factory) {
862
+ throw new Error(`Unsupported embedding provider: ${config.provider}`);
507
863
  }
864
+ return factory(config, logger);
508
865
  }
509
866
 
867
+ /**
868
+ * @module logger
869
+ * Creates pino logger instances. I/O: optionally writes logs to file via pino/file transport. Defaults to stdout at info level.
870
+ */
510
871
  /**
511
872
  * Create a pino logger instance.
512
873
  *
@@ -525,6 +886,45 @@
525
886
  return pino({ level });
526
887
  }
527
888
 
889
+ /**
890
+ * @module hash
891
+ * Provides SHA-256 content hashing. Pure function: given text string, returns hex digest. No I/O or side effects.
892
+ */
893
+ /**
894
+ * Compute a SHA-256 hex digest of the given text.
895
+ *
896
+ * @param text - The input text to hash.
897
+ * @returns The hex-encoded SHA-256 hash.
898
+ */
899
+ function contentHash(text) {
900
+ return node_crypto.createHash('sha256').update(text, 'utf8').digest('hex');
901
+ }
902
+
903
+ /**
904
+ * @module pointId
905
+ * Generates deterministic UUIDv5 point IDs for file paths and chunk indices. Pure function: normalizes paths, returns stable IDs. No I/O.
906
+ */
907
+ /** Namespace UUID for jeeves-watcher point IDs. */
908
+ const NAMESPACE = '6a6f686e-6761-4c74-ad6a-656576657321';
909
+ /**
910
+ * Generate a deterministic UUID v5 point ID for a file (and optional chunk index).
911
+ *
912
+ * @param filePath - The file path.
913
+ * @param chunkIndex - Optional chunk index within the file.
914
+ * @returns A deterministic UUID v5 string.
915
+ */
916
+ function pointId(filePath, chunkIndex) {
917
+ const key = chunkIndex !== undefined
918
+ ? `${normalizePath(filePath)}#${String(chunkIndex)}`
919
+ : normalizePath(filePath);
920
+ return uuid.v5(key, NAMESPACE);
921
+ }
922
+
923
+ /**
924
+ * @module extractors
925
+ *
926
+ * Text extraction registry for supported file formats.
927
+ */
528
928
  /**
529
929
  * Extract YAML frontmatter from a Markdown document.
530
930
  *
@@ -570,6 +970,55 @@
570
970
  }
571
971
  return JSON.stringify(obj);
572
972
  }
973
+ async function extractMarkdown(filePath) {
974
+ const raw = await promises.readFile(filePath, 'utf8');
975
+ const { frontmatter, body } = extractMarkdownFrontmatter(raw);
976
+ return { text: body, frontmatter };
977
+ }
978
+ async function extractPlaintext(filePath) {
979
+ const raw = await promises.readFile(filePath, 'utf8');
980
+ return { text: raw };
981
+ }
982
+ async function extractJson(filePath) {
983
+ const raw = await promises.readFile(filePath, 'utf8');
984
+ const parsed = JSON.parse(raw);
985
+ const json = parsed && typeof parsed === 'object' && !Array.isArray(parsed)
986
+ ? parsed
987
+ : undefined;
988
+ return { text: extractJsonText(parsed), json };
989
+ }
990
+ async function extractPdf(filePath) {
991
+ const buffer = await promises.readFile(filePath);
992
+ const uint8Array = new Uint8Array(buffer);
993
+ const { extractText: extractPdfText } = await import('unpdf');
994
+ const { text } = await extractPdfText(uint8Array);
995
+ // unpdf returns an array of strings (one per page)
996
+ const content = Array.isArray(text) ? text.join('\n\n') : text;
997
+ return { text: content };
998
+ }
999
+ async function extractDocx(filePath) {
1000
+ const buffer = await promises.readFile(filePath);
1001
+ const result = await mammoth.extractRawText({ buffer });
1002
+ return { text: result.value };
1003
+ }
1004
+ async function extractHtml(filePath) {
1005
+ const raw = await promises.readFile(filePath, 'utf8');
1006
+ const $ = cheerio__namespace.load(raw);
1007
+ $('script, style').remove();
1008
+ const text = $('body').text().trim() || $.text().trim();
1009
+ return { text };
1010
+ }
1011
    /**
     * Registry mapping lowercase file extensions (including the leading dot)
     * to extractor functions. Extensions not listed here fall back to
     * plaintext extraction in extractText.
     */
    const extractorRegistry = new Map([
        ['.md', extractMarkdown],
        ['.markdown', extractMarkdown],
        ['.txt', extractPlaintext],
        ['.text', extractPlaintext],
        ['.json', extractJson],
        ['.pdf', extractPdf],
        ['.docx', extractDocx],
        ['.html', extractHtml],
        ['.htm', extractHtml],
    ]);
573
1022
  /**
574
1023
  * Extract text from a file based on extension.
575
1024
  *
@@ -578,87 +1027,132 @@
578
1027
  * @returns Extracted text and optional structured data.
579
1028
  */
580
1029
  async function extractText(filePath, extension) {
581
- const ext = extension.toLowerCase();
582
- if (ext === '.md' || ext === '.markdown') {
583
- const raw = await promises.readFile(filePath, 'utf8');
584
- const { frontmatter, body } = extractMarkdownFrontmatter(raw);
585
- return { text: body, frontmatter };
586
- }
587
- if (ext === '.txt' || ext === '.text') {
588
- const raw = await promises.readFile(filePath, 'utf8');
589
- return { text: raw };
590
- }
591
- if (ext === '.json') {
592
- const raw = await promises.readFile(filePath, 'utf8');
593
- const parsed = JSON.parse(raw);
594
- const json = parsed && typeof parsed === 'object' && !Array.isArray(parsed)
595
- ? parsed
596
- : undefined;
597
- return { text: extractJsonText(parsed), json };
598
- }
599
- if (ext === '.pdf') {
600
- const buffer = await promises.readFile(filePath);
601
- const uint8Array = new Uint8Array(buffer);
602
- const { extractText: extractPdfText } = await import('unpdf');
603
- const { text } = await extractPdfText(uint8Array);
604
- // unpdf returns an array of strings (one per page)
605
- const content = Array.isArray(text) ? text.join('\n\n') : text;
606
- return { text: content };
607
- }
608
- if (ext === '.docx') {
609
- const buffer = await promises.readFile(filePath);
610
- const result = await mammoth.extractRawText({ buffer });
611
- return { text: result.value };
612
- }
613
- if (ext === '.html' || ext === '.htm') {
614
- const raw = await promises.readFile(filePath, 'utf8');
615
- const $ = cheerio__namespace.load(raw);
616
- // Remove script and style elements
617
- $('script, style').remove();
618
- // Extract text content
619
- const text = $('body').text().trim() || $.text().trim();
620
- return { text };
621
- }
1030
+ const extractor = extractorRegistry.get(extension.toLowerCase());
1031
+ if (extractor)
1032
+ return extractor(filePath);
622
1033
  // Default: treat as plaintext.
623
- const raw = await promises.readFile(filePath, 'utf8');
624
- return { text: raw };
1034
+ return extractPlaintext(filePath);
625
1035
  }
626
1036
 
627
1037
  /**
628
- * Compute a SHA-256 hex digest of the given text.
1038
+ * @module rules/templates
1039
+ * Resolves template variables (`${path.to.value}`) in rule `set` objects against file attributes.
1040
+ */
1041
+ /**
1042
+ * Resolve `${template.vars}` in a value against the given attributes.
629
1043
  *
630
- * @param text - The input text to hash.
631
- * @returns The hex-encoded SHA-256 hash.
1044
+ * @param value - The value to resolve.
1045
+ * @param attributes - The file attributes for variable lookup.
1046
+ * @returns The resolved value.
632
1047
  */
633
- function contentHash(text) {
634
- return node_crypto.createHash('sha256').update(text, 'utf8').digest('hex');
1048
+ function resolveTemplateVars(value, attributes) {
1049
+ if (typeof value !== 'string')
1050
+ return value;
1051
+ return value.replace(/\$\{([^}]+)\}/g, (_match, varPath) => {
1052
+ const current = radash.get(attributes, varPath);
1053
+ if (current === null || current === undefined)
1054
+ return '';
1055
+ return typeof current === 'string' ? current : JSON.stringify(current);
1056
+ });
1057
+ }
1058
+ /**
1059
+ * Resolve all template variables in a `set` object.
1060
+ *
1061
+ * @param setObj - The key-value pairs to resolve.
1062
+ * @param attributes - The file attributes for variable lookup.
1063
+ * @returns The resolved key-value pairs.
1064
+ */
1065
+ function resolveSet(setObj, attributes) {
1066
+ const result = {};
1067
+ for (const [key, value] of Object.entries(setObj)) {
1068
+ result[key] = resolveTemplateVars(value, attributes);
1069
+ }
1070
+ return result;
635
1071
  }
636
1072
 
637
- /** Namespace UUID for jeeves-watcher point IDs. */
638
- const NAMESPACE = '6a6f686e-6761-4c74-ad6a-656576657321';
639
1073
  /**
640
- * Normalise a file path for deterministic point ID generation.
1074
+ * @module rules/apply
1075
+ * Applies compiled inference rules to file attributes, producing merged metadata via template resolution and JsonMap transforms.
1076
+ */
1077
+ /**
1078
+ * Create the lib object for JsonMap transformations.
641
1079
  *
642
- * @param filePath - The original file path.
643
- * @returns The normalised path string.
1080
+ * @returns The lib object.
644
1081
  */
645
- function normalisePath(filePath) {
646
- return filePath.replace(/\\/g, '/').toLowerCase();
1082
+ function createJsonMapLib() {
1083
+ return {
1084
+ split: (str, separator) => str.split(separator),
1085
+ slice: (arr, start, end) => arr.slice(start, end),
1086
+ join: (arr, separator) => arr.join(separator),
1087
+ toLowerCase: (str) => str.toLowerCase(),
1088
+ replace: (str, search, replacement) => str.replace(search, replacement),
1089
+ get: (obj, path) => radash.get(obj, path),
1090
+ };
647
1091
  }
648
1092
  /**
649
- * Generate a deterministic UUID v5 point ID for a file (and optional chunk index).
1093
+ * Apply compiled inference rules to file attributes, returning merged metadata.
650
1094
  *
651
- * @param filePath - The file path.
652
- * @param chunkIndex - Optional chunk index within the file.
653
- * @returns A deterministic UUID v5 string.
1095
+ * Rules are evaluated in order; later rules override earlier ones.
1096
+ * If a rule has a `map`, the JsonMap transformation is applied after `set` resolution,
1097
+ * and map output overrides set output on conflict.
1098
+ *
1099
+ * @param compiledRules - The compiled rules to evaluate.
1100
+ * @param attributes - The file attributes to match against.
1101
+ * @param namedMaps - Optional record of named JsonMap definitions.
1102
+ * @param logger - Optional logger for warnings (falls back to console.warn).
1103
+ * @returns The merged metadata from all matching rules.
654
1104
  */
655
- function pointId(filePath, chunkIndex) {
656
- const key = chunkIndex !== undefined
657
- ? `${normalisePath(filePath)}#${String(chunkIndex)}`
658
- : normalisePath(filePath);
659
- return uuid.v5(key, NAMESPACE);
1105
    async function applyRules(compiledRules, attributes, namedMaps, logger) {
        // JsonMap's type definitions expect a generic JsonMapLib shape with unary functions.
        // Our helper functions accept multiple args, which JsonMap supports at runtime.
        const lib = createJsonMapLib();
        let merged = {};
        // Fall back to console so warnings are never silently dropped.
        const log = logger ?? console;
        // Rules run in order; later matches overwrite earlier keys in `merged`.
        for (const { rule, validate } of compiledRules) {
            if (validate(attributes)) {
                // Apply set resolution
                const setOutput = resolveSet(rule.set, attributes);
                merged = { ...merged, ...setOutput };
                // Apply map transformation if present
                if (rule.map) {
                    let mapDef;
                    // Resolve map reference
                    if (typeof rule.map === 'string') {
                        mapDef = namedMaps?.[rule.map];
                        if (!mapDef) {
                            // Missing named map: this rule's set output is kept,
                            // only its map transformation is skipped.
                            log.warn(`Map reference "${rule.map}" not found in named maps. Skipping map transformation.`);
                            continue;
                        }
                    }
                    else {
                        mapDef = rule.map;
                    }
                    // Execute JsonMap transformation
                    try {
                        const jsonMap = new jsonmap.JsonMap(mapDef, lib);
                        const mapOutput = await jsonMap.transform(attributes);
                        // Only object outputs are merged; map output wins over set output.
                        if (mapOutput &&
                            typeof mapOutput === 'object' &&
                            !Array.isArray(mapOutput)) {
                            merged = { ...merged, ...mapOutput };
                        }
                        else {
                            log.warn(`JsonMap transformation did not return an object; skipping merge.`);
                        }
                    }
                    catch (error) {
                        // A failed transform degrades to a warning; other rules still apply.
                        log.warn(`JsonMap transformation failed: ${error instanceof Error ? error.message : String(error)}`);
                    }
                }
            }
        }
        return merged;
    }
661
1151
 
1152
+ /**
1153
+ * @module rules/attributes
1154
+ * Builds file attribute objects for rule matching. Pure function: derives attributes from path, stats, and extracted data.
1155
+ */
662
1156
  /**
663
1157
  * Build {@link FileAttributes} from a file path and stat info.
664
1158
  *
@@ -686,10 +1180,15 @@
686
1180
  attrs.json = extractedJson;
687
1181
  return attrs;
688
1182
  }
1183
+
1184
+ /**
1185
+ * @module rules/ajvSetup
1186
+ * AJV instance factory with custom glob keyword for picomatch-based pattern matching in rule schemas.
1187
+ */
689
1188
  /**
690
- * Create an ajv instance with a custom `glob` format for picomatch glob matching.
1189
+ * Create an AJV instance with a custom `glob` format for picomatch glob matching.
691
1190
  *
692
- * @returns The configured ajv instance.
1191
+ * @returns The configured AJV instance.
693
1192
  */
694
1193
  function createRuleAjv() {
695
1194
  const ajv = new Ajv({ allErrors: true });
@@ -702,6 +1201,11 @@
702
1201
  });
703
1202
  return ajv;
704
1203
  }
1204
+
1205
+ /**
1206
+ * @module rules/compile
1207
+ * Compiles inference rule definitions into executable AJV validators for efficient rule evaluation.
1208
+ */
705
1209
  /**
706
1210
  * Compile an array of inference rules into executable validators.
707
1211
  *
@@ -718,62 +1222,95 @@
718
1222
  }),
719
1223
  }));
720
1224
  }
1225
+
1226
+ /**
1227
+ * @module processor/buildMetadata
1228
+ * Builds merged metadata from file content, inference rules, and enrichment. I/O: reads files, extracts text, loads enrichment .meta.json.
1229
+ */
721
1230
  /**
722
- * Resolve `$\{template.vars\}` in a value against the given attributes.
1231
+ * Build merged metadata for a file by applying inference rules and merging with enrichment metadata.
723
1232
  *
724
- * @param value - The value to resolve.
725
- * @param attributes - The file attributes for variable lookup.
726
- * @returns The resolved value.
1233
+ * @param filePath - The file to process.
1234
+ * @param compiledRules - The compiled inference rules.
1235
+ * @param metadataDir - The metadata directory for enrichment files.
1236
+ * @param maps - Optional named JsonMap definitions.
1237
+ * @param logger - Optional logger for rule warnings.
1238
+ * @returns The merged metadata and intermediate data.
727
1239
  */
728
- function resolveTemplateVars(value, attributes) {
729
- if (typeof value !== 'string')
730
- return value;
731
- return value.replace(/\$\{([^}]+)\}/g, (_match, varPath) => {
732
- const parts = varPath.split('.');
733
- let current = attributes;
734
- for (const part of parts) {
735
- if (current === null || current === undefined)
736
- return '';
737
- current = current[part];
738
- }
739
- if (current === null || current === undefined)
740
- return '';
741
- return typeof current === 'string' ? current : JSON.stringify(current);
742
- });
1240
+ async function buildMergedMetadata(filePath, compiledRules, metadataDir, maps, logger) {
1241
+ const ext = node_path.extname(filePath);
1242
+ const stats = await promises.stat(filePath);
1243
+ // 1. Extract text and structured data
1244
+ const extracted = await extractText(filePath, ext);
1245
+ // 2. Build attributes + apply rules
1246
+ const attributes = buildAttributes(filePath, stats, extracted.frontmatter, extracted.json);
1247
+ const inferred = await applyRules(compiledRules, attributes, maps, logger);
1248
+ // 3. Read enrichment metadata (merge, enrichment wins)
1249
+ const enrichment = await readMetadata(filePath, metadataDir);
1250
+ const metadata = {
1251
+ ...inferred,
1252
+ ...(enrichment ?? {}),
1253
+ };
1254
+ return { inferred, enrichment, metadata, attributes, extracted };
743
1255
  }
1256
+
744
1257
  /**
745
- * Resolve all template variables in a `set` object.
1258
+ * @module processor/chunkIds
1259
+ * Generates chunk point IDs from file paths and chunk indices. Extracts chunk counts from Qdrant payloads. Pure functions, no I/O.
1260
+ */
1261
+ /**
1262
+ * Generate an array of chunk IDs for a file.
746
1263
  *
747
- * @param setObj - The key-value pairs to resolve.
748
- * @param attributes - The file attributes for variable lookup.
749
- * @returns The resolved key-value pairs.
1264
+ * @param filePath - The file path.
1265
+ * @param totalChunks - The total number of chunks.
1266
+ * @returns An array of point IDs for each chunk.
750
1267
  */
751
- function resolveSet(setObj, attributes) {
752
- const result = {};
753
- for (const [key, value] of Object.entries(setObj)) {
754
- result[key] = resolveTemplateVars(value, attributes);
1268
+ function chunkIds(filePath, totalChunks) {
1269
+ const ids = [];
1270
+ for (let i = 0; i < totalChunks; i++) {
1271
+ ids.push(pointId(filePath, i));
755
1272
  }
756
- return result;
1273
+ return ids;
757
1274
  }
758
1275
  /**
759
- * Apply compiled inference rules to file attributes, returning merged metadata.
1276
+ * Extract the total chunk count from a payload, with a fallback.
760
1277
  *
761
- * Rules are evaluated in order; later rules override earlier ones.
1278
+ * @param payload - The Qdrant point payload (or null).
1279
+ * @param fallback - The fallback value if total_chunks is missing or invalid.
1280
+ * @returns The total chunk count.
1281
+ */
1282
+ function getChunkCount(payload, fallback = 1) {
1283
+ if (!payload)
1284
+ return fallback;
1285
+ const count = payload['total_chunks'];
1286
+ return typeof count === 'number' ? count : fallback;
1287
+ }
1288
+
1289
+ /**
1290
+ * @module processor/splitter
1291
+ * Factory for LangChain text splitters. Returns MarkdownTextSplitter or RecursiveCharacterTextSplitter based on file extension. No I/O.
1292
+ */
1293
+ /**
1294
+ * Create the appropriate text splitter for the given file extension.
762
1295
  *
763
- * @param compiledRules - The compiled rules to evaluate.
764
- * @param attributes - The file attributes to match against.
765
- * @returns The merged metadata from all matching rules.
1296
+ * @param ext - File extension (including leading dot).
1297
+ * @param chunkSize - Maximum chunk size in characters.
1298
+ * @param chunkOverlap - Overlap between chunks in characters.
1299
+ * @returns A text splitter instance.
766
1300
  */
767
- function applyRules(compiledRules, attributes) {
768
- let merged = {};
769
- for (const { rule, validate } of compiledRules) {
770
- if (validate(attributes)) {
771
- merged = { ...merged, ...resolveSet(rule.set, attributes) };
772
- }
1301
+ function createSplitter(ext, chunkSize, chunkOverlap) {
1302
+ const lowerExt = ext.toLowerCase();
1303
+ if (lowerExt === '.md' || lowerExt === '.markdown') {
1304
+ return new textsplitters.MarkdownTextSplitter({ chunkSize, chunkOverlap });
773
1305
  }
774
- return merged;
1306
+ return new textsplitters.RecursiveCharacterTextSplitter({ chunkSize, chunkOverlap });
775
1307
  }
776
1308
 
1309
+ /**
1310
+ * @module processor
1311
+ *
1312
+ * Core document processing pipeline. Handles extracting text, computing embeddings, syncing with vector store.
1313
+ */
777
1314
  /**
778
1315
  * Core document processing pipeline.
779
1316
  *
@@ -785,11 +1322,10 @@
785
1322
  vectorStore;
786
1323
  compiledRules;
787
1324
  logger;
788
- metadataDir;
789
1325
  /**
790
1326
  * Create a new DocumentProcessor.
791
1327
  *
792
- * @param config - The application configuration.
1328
+ * @param config - The processor configuration.
793
1329
  * @param embeddingProvider - The embedding provider.
794
1330
  * @param vectorStore - The vector store client.
795
1331
  * @param compiledRules - The compiled inference rules.
@@ -801,7 +1337,6 @@
801
1337
  this.vectorStore = vectorStore;
802
1338
  this.compiledRules = compiledRules;
803
1339
  this.logger = logger;
804
- this.metadataDir = config.metadataDir ?? '.jeeves-metadata';
805
1340
  }
806
1341
  /**
807
1342
  * Process a file through the full pipeline: extract, hash, chunk, embed, upsert.
@@ -811,9 +1346,8 @@
811
1346
  async processFile(filePath) {
812
1347
  try {
813
1348
  const ext = node_path.extname(filePath);
814
- const stats = await promises.stat(filePath);
815
- // 1. Extract text
816
- const extracted = await extractText(filePath, ext);
1349
+ // 1. Build merged metadata + extract text
1350
+ const { metadata, extracted } = await buildMergedMetadata(filePath, this.compiledRules, this.config.metadataDir, this.config.maps, this.logger);
817
1351
  if (!extracted.text.trim()) {
818
1352
  this.logger.debug({ filePath }, 'Skipping empty file');
819
1353
  return;
@@ -826,26 +1360,15 @@
826
1360
  this.logger.debug({ filePath }, 'Content unchanged, skipping');
827
1361
  return;
828
1362
  }
829
- const oldTotalChunks = typeof existingPayload?.['total_chunks'] === 'number'
830
- ? existingPayload['total_chunks']
831
- : 0;
832
- // 3. Build attributes + apply rules → inferred metadata
833
- const attributes = buildAttributes(filePath, stats, extracted.frontmatter, extracted.json);
834
- const inferred = applyRules(this.compiledRules, attributes);
835
- // 4. Read enrichment metadata (merge, enrichment wins)
836
- const enrichment = await readMetadata(filePath, this.metadataDir);
837
- const metadata = {
838
- ...inferred,
839
- ...(enrichment ?? {}),
840
- };
841
- // 5. Chunk text
842
- const chunkSize = this.config.embedding.chunkSize ?? 1000;
843
- const chunkOverlap = this.config.embedding.chunkOverlap ?? 200;
844
- const splitter = this.createSplitter(ext, chunkSize, chunkOverlap);
1363
+ const oldTotalChunks = getChunkCount(existingPayload);
1364
+ // 3. Chunk text
1365
+ const chunkSize = this.config.chunkSize ?? 1000;
1366
+ const chunkOverlap = this.config.chunkOverlap ?? 200;
1367
+ const splitter = createSplitter(ext, chunkSize, chunkOverlap);
845
1368
  const chunks = await splitter.splitText(extracted.text);
846
- // 6. Embed all chunks
1369
+ // 4. Embed all chunks
847
1370
  const vectors = await this.embeddingProvider.embed(chunks);
848
- // 7. Upsert all chunk points
1371
+ // 5. Upsert all chunk points
849
1372
  const points = chunks.map((chunk, i) => ({
850
1373
  id: pointId(filePath, i),
851
1374
  vector: vectors[i],
@@ -859,12 +1382,9 @@
859
1382
  },
860
1383
  }));
861
1384
  await this.vectorStore.upsert(points);
862
- // 8. Clean up orphaned chunks
1385
+ // 6. Clean up orphaned chunks
863
1386
  if (oldTotalChunks > chunks.length) {
864
- const orphanIds = [];
865
- for (let i = chunks.length; i < oldTotalChunks; i++) {
866
- orphanIds.push(pointId(filePath, i));
867
- }
1387
+ const orphanIds = chunkIds(filePath, oldTotalChunks).slice(chunks.length);
868
1388
  await this.vectorStore.delete(orphanIds);
869
1389
  }
870
1390
  this.logger.info({ filePath, chunks: chunks.length }, 'File processed successfully');
@@ -883,15 +1403,10 @@
883
1403
  // Get the existing payload to find total chunks
884
1404
  const baseId = pointId(filePath, 0);
885
1405
  const existingPayload = await this.vectorStore.getPayload(baseId);
886
- const totalChunks = typeof existingPayload?.['total_chunks'] === 'number'
887
- ? existingPayload['total_chunks']
888
- : 1;
889
- const ids = [];
890
- for (let i = 0; i < totalChunks; i++) {
891
- ids.push(pointId(filePath, i));
892
- }
1406
+ const totalChunks = getChunkCount(existingPayload);
1407
+ const ids = chunkIds(filePath, totalChunks);
893
1408
  await this.vectorStore.delete(ids);
894
- await deleteMetadata(filePath, this.metadataDir);
1409
+ await deleteMetadata(filePath, this.config.metadataDir);
895
1410
  this.logger.info({ filePath }, 'File deleted from index');
896
1411
  }
897
1412
  catch (error) {
@@ -908,21 +1423,16 @@
908
1423
  async processMetadataUpdate(filePath, metadata) {
909
1424
  try {
910
1425
  // Read existing enrichment metadata and merge
911
- const existing = (await readMetadata(filePath, this.metadataDir)) ?? {};
1426
+ const existing = (await readMetadata(filePath, this.config.metadataDir)) ?? {};
912
1427
  const merged = { ...existing, ...metadata };
913
- await writeMetadata(filePath, this.metadataDir, merged);
1428
+ await writeMetadata(filePath, this.config.metadataDir, merged);
914
1429
  // Update all chunk payloads in Qdrant
915
1430
  const baseId = pointId(filePath, 0);
916
1431
  const existingPayload = await this.vectorStore.getPayload(baseId);
917
1432
  if (!existingPayload)
918
1433
  return null;
919
- const totalChunks = typeof existingPayload['total_chunks'] === 'number'
920
- ? existingPayload['total_chunks']
921
- : 1;
922
- const ids = [];
923
- for (let i = 0; i < totalChunks; i++) {
924
- ids.push(pointId(filePath, i));
925
- }
1434
+ const totalChunks = getChunkCount(existingPayload);
1435
+ const ids = chunkIds(filePath, totalChunks);
926
1436
  await this.vectorStore.setPayload(ids, merged);
927
1437
  this.logger.info({ filePath, chunks: totalChunks }, 'Metadata updated');
928
1438
  return merged;
@@ -948,27 +1458,11 @@
948
1458
  this.logger.debug({ filePath }, 'File not indexed, skipping');
949
1459
  return null;
950
1460
  }
951
- const ext = node_path.extname(filePath);
952
- const stats = await promises.stat(filePath);
953
- // Extract frontmatter/json for attribute building (lightweight)
954
- const extracted = await extractText(filePath, ext);
955
- // Build attributes + apply current rules
956
- const attributes = buildAttributes(filePath, stats, extracted.frontmatter, extracted.json);
957
- const inferred = applyRules(this.compiledRules, attributes);
958
- // Read enrichment metadata (merge, enrichment wins)
959
- const enrichment = await readMetadata(filePath, this.metadataDir);
960
- const metadata = {
961
- ...inferred,
962
- ...(enrichment ?? {}),
963
- };
1461
+ // Build merged metadata (lightweight — no embedding)
1462
+ const { metadata } = await buildMergedMetadata(filePath, this.compiledRules, this.config.metadataDir, this.config.maps, this.logger);
964
1463
  // Update all chunk payloads
965
- const totalChunks = typeof existingPayload['total_chunks'] === 'number'
966
- ? existingPayload['total_chunks']
967
- : 1;
968
- const ids = [];
969
- for (let i = 0; i < totalChunks; i++) {
970
- ids.push(pointId(filePath, i));
971
- }
1464
+ const totalChunks = getChunkCount(existingPayload);
1465
+ const ids = chunkIds(filePath, totalChunks);
972
1466
  await this.vectorStore.setPayload(ids, metadata);
973
1467
  this.logger.info({ filePath, chunks: totalChunks }, 'Rules re-applied');
974
1468
  return metadata;
@@ -987,23 +1481,12 @@
987
1481
  this.compiledRules = compiledRules;
988
1482
  this.logger.info({ rules: compiledRules.length }, 'Inference rules updated');
989
1483
  }
990
- /**
991
- * Create the appropriate text splitter for the given file extension.
992
- *
993
- * @param ext - File extension.
994
- * @param chunkSize - Maximum chunk size in characters.
995
- * @param chunkOverlap - Overlap between chunks in characters.
996
- * @returns A text splitter instance.
997
- */
998
- createSplitter(ext, chunkSize, chunkOverlap) {
999
- const lowerExt = ext.toLowerCase();
1000
- if (lowerExt === '.md' || lowerExt === '.markdown') {
1001
- return new textsplitters.MarkdownTextSplitter({ chunkSize, chunkOverlap });
1002
- }
1003
- return new textsplitters.RecursiveCharacterTextSplitter({ chunkSize, chunkOverlap });
1004
- }
1005
1484
  }
1006
1485
 
1486
+ /**
1487
+ * @module queue
1488
+ * Debounced, rate-limited, concurrent event queue for file watchers. Manages priority queuing and async callbacks. No direct I/O; orchestrates processing.
1489
+ */
1007
1490
  /**
1008
1491
  * A debounced, rate-limited, concurrent event queue.
1009
1492
  */
@@ -1152,19 +1635,23 @@
1152
1635
  client;
1153
1636
  collectionName;
1154
1637
  dims;
1638
+ log;
1155
1639
  /**
1156
1640
  * Create a new VectorStoreClient.
1157
1641
  *
1158
1642
  * @param config - Vector store configuration.
1159
1643
  * @param dimensions - The embedding vector dimensions.
1644
+ * @param logger - Optional pino logger for retry warnings.
1160
1645
  */
1161
- constructor(config, dimensions) {
1646
+ constructor(config, dimensions, logger) {
1162
1647
  this.client = new jsClientRest.QdrantClient({
1163
1648
  url: config.url,
1164
1649
  apiKey: config.apiKey,
1650
+ checkCompatibility: false,
1165
1651
  });
1166
1652
  this.collectionName = config.collectionName;
1167
1653
  this.dims = dimensions;
1654
+ this.log = getLogger(logger);
1168
1655
  }
1169
1656
  /**
1170
1657
  * Ensure the collection exists with correct dimensions and Cosine distance.
@@ -1191,13 +1678,26 @@
1191
1678
  async upsert(points) {
1192
1679
  if (points.length === 0)
1193
1680
  return;
1194
- await this.client.upsert(this.collectionName, {
1195
- wait: true,
1196
- points: points.map((p) => ({
1197
- id: p.id,
1198
- vector: p.vector,
1199
- payload: p.payload,
1200
- })),
1681
+ await retry(async (attempt) => {
1682
+ if (attempt > 1) {
1683
+ this.log.warn({ attempt, operation: 'qdrant.upsert', points: points.length }, 'Retrying Qdrant upsert');
1684
+ }
1685
+ await this.client.upsert(this.collectionName, {
1686
+ wait: true,
1687
+ points: points.map((p) => ({
1688
+ id: p.id,
1689
+ vector: p.vector,
1690
+ payload: p.payload,
1691
+ })),
1692
+ });
1693
+ }, {
1694
+ attempts: 5,
1695
+ baseDelayMs: 500,
1696
+ maxDelayMs: 10_000,
1697
+ jitter: 0.2,
1698
+ onRetry: ({ attempt, delayMs, error }) => {
1699
+ this.log.warn({ attempt, delayMs, operation: 'qdrant.upsert', error }, 'Qdrant upsert failed; will retry');
1700
+ },
1201
1701
  });
1202
1702
  }
1203
1703
  /**
@@ -1208,9 +1708,22 @@
1208
1708
  async delete(ids) {
1209
1709
  if (ids.length === 0)
1210
1710
  return;
1211
- await this.client.delete(this.collectionName, {
1212
- wait: true,
1213
- points: ids,
1711
+ await retry(async (attempt) => {
1712
+ if (attempt > 1) {
1713
+ this.log.warn({ attempt, operation: 'qdrant.delete', ids: ids.length }, 'Retrying Qdrant delete');
1714
+ }
1715
+ await this.client.delete(this.collectionName, {
1716
+ wait: true,
1717
+ points: ids,
1718
+ });
1719
+ }, {
1720
+ attempts: 5,
1721
+ baseDelayMs: 500,
1722
+ maxDelayMs: 10_000,
1723
+ jitter: 0.2,
1724
+ onRetry: ({ attempt, delayMs, error }) => {
1725
+ this.log.warn({ attempt, delayMs, operation: 'qdrant.delete', error }, 'Qdrant delete failed; will retry');
1726
+ },
1214
1727
  });
1215
1728
  }
1216
1729
  /**
@@ -1310,6 +1823,10 @@
1310
1823
  }
1311
1824
  }
1312
1825
 
1826
+ /**
1827
+ * @module watcher
1828
+ * Filesystem watcher wrapping chokidar. I/O: watches files/directories for add/change/unlink events, enqueues to processing queue.
1829
+ */
1313
1830
  /**
1314
1831
  * Filesystem watcher that maps chokidar events to the processing queue.
1315
1832
  */
@@ -1376,57 +1893,141 @@
1376
1893
  }
1377
1894
  }
1378
1895
 
1896
+ /**
1897
+ * @module app/configWatcher
1898
+ * Watches the config file for changes and triggers debounced reload. Isolated I/O wrapper around chokidar.
1899
+ */
1900
+ /**
1901
+ * Debounced config file watcher.
1902
+ */
1903
+ class ConfigWatcher {
1904
+ options;
1905
+ watcher;
1906
+ debounce;
1907
+ constructor(options) {
1908
+ this.options = options;
1909
+ }
1910
+ start() {
1911
+ if (!this.options.enabled)
1912
+ return;
1913
+ this.watcher = chokidar.watch(this.options.configPath, {
1914
+ ignoreInitial: true,
1915
+ });
1916
+ this.watcher.on('change', () => {
1917
+ if (this.debounce)
1918
+ clearTimeout(this.debounce);
1919
+ this.debounce = setTimeout(() => {
1920
+ void this.options.onChange();
1921
+ }, this.options.debounceMs);
1922
+ });
1923
+ this.watcher.on('error', (error) => {
1924
+ this.options.logger.error({ error }, 'Config watcher error');
1925
+ });
1926
+ this.options.logger.info({
1927
+ configPath: this.options.configPath,
1928
+ debounceMs: this.options.debounceMs,
1929
+ }, 'Config watcher started');
1930
+ }
1931
+ async stop() {
1932
+ if (this.debounce) {
1933
+ clearTimeout(this.debounce);
1934
+ this.debounce = undefined;
1935
+ }
1936
+ if (this.watcher) {
1937
+ await this.watcher.close();
1938
+ this.watcher = undefined;
1939
+ }
1940
+ }
1941
+ }
1942
+
1943
+ /**
1944
+ * @module app/shutdown
1945
+ * Process signal shutdown orchestration. Installs SIGINT/SIGTERM handlers that invoke a provided async stop function.
1946
+ */
1947
+ /**
1948
+ * Install process signal handlers.
1949
+ *
1950
+ * @param stop - Async stop function to invoke on shutdown signals.
1951
+ */
1952
+ function installShutdownHandlers(stop) {
1953
+ const shutdown = async () => {
1954
+ await stop();
1955
+ process.exit(0);
1956
+ };
1957
+ process.on('SIGTERM', () => void shutdown());
1958
+ process.on('SIGINT', () => void shutdown());
1959
+ }
1960
+
1961
+ const defaultFactories = {
1962
+ loadConfig,
1963
+ createLogger,
1964
+ createEmbeddingProvider,
1965
+ createVectorStoreClient: (config, dimensions, logger) => new VectorStoreClient(config, dimensions, logger),
1966
+ compileRules,
1967
+ createDocumentProcessor: (config, embeddingProvider, vectorStore, compiledRules, logger) => new DocumentProcessor(config, embeddingProvider, vectorStore, compiledRules, logger),
1968
+ createEventQueue: (options) => new EventQueue(options),
1969
+ createFileSystemWatcher: (config, queue, processor, logger) => new FileSystemWatcher(config, queue, processor, logger),
1970
+ createApiServer,
1971
+ };
1379
1972
  /**
1380
1973
  * Main application class that wires together all components.
1381
1974
  */
1382
1975
  class JeevesWatcher {
1383
1976
  config;
1384
1977
  configPath;
1978
+ factories;
1385
1979
  logger;
1386
1980
  watcher;
1387
1981
  queue;
1388
1982
  server;
1389
1983
  processor;
1390
1984
  configWatcher;
1391
- configDebounce;
1392
1985
  /**
1393
1986
  * Create a new JeevesWatcher instance.
1394
1987
  *
1395
1988
  * @param config - The application configuration.
1396
1989
  * @param configPath - Optional config file path to watch for changes.
1990
+ * @param factories - Optional component factories (for dependency injection).
1397
1991
  */
1398
- constructor(config, configPath) {
1992
+ constructor(config, configPath, factories = {}) {
1399
1993
  this.config = config;
1400
1994
  this.configPath = configPath;
1995
+ this.factories = { ...defaultFactories, ...factories };
1401
1996
  }
1402
1997
  /**
1403
1998
  * Start the watcher, API server, and all components.
1404
1999
  */
1405
2000
  async start() {
1406
- const logger = createLogger(this.config.logging);
2001
+ const logger = this.factories.createLogger(this.config.logging);
1407
2002
  this.logger = logger;
1408
2003
  let embeddingProvider;
1409
2004
  try {
1410
- embeddingProvider = createEmbeddingProvider(this.config.embedding);
2005
+ embeddingProvider = this.factories.createEmbeddingProvider(this.config.embedding, logger);
1411
2006
  }
1412
2007
  catch (error) {
1413
2008
  logger.fatal({ error }, 'Failed to create embedding provider');
1414
2009
  throw error;
1415
2010
  }
1416
- const vectorStore = new VectorStoreClient(this.config.vectorStore, embeddingProvider.dimensions);
2011
+ const vectorStore = this.factories.createVectorStoreClient(this.config.vectorStore, embeddingProvider.dimensions, logger);
1417
2012
  await vectorStore.ensureCollection();
1418
- const compiledRules = compileRules(this.config.inferenceRules ?? []);
1419
- const processor = new DocumentProcessor(this.config, embeddingProvider, vectorStore, compiledRules, logger);
2013
+ const compiledRules = this.factories.compileRules(this.config.inferenceRules ?? []);
2014
+ const processorConfig = {
2015
+ metadataDir: this.config.metadataDir ?? '.jeeves-metadata',
2016
+ chunkSize: this.config.embedding.chunkSize,
2017
+ chunkOverlap: this.config.embedding.chunkOverlap,
2018
+ maps: this.config.maps,
2019
+ };
2020
+ const processor = this.factories.createDocumentProcessor(processorConfig, embeddingProvider, vectorStore, compiledRules, logger);
1420
2021
  this.processor = processor;
1421
- const queue = new EventQueue({
2022
+ const queue = this.factories.createEventQueue({
1422
2023
  debounceMs: this.config.watch.debounceMs ?? 2000,
1423
2024
  concurrency: this.config.embedding.concurrency ?? 5,
1424
2025
  rateLimitPerMinute: this.config.embedding.rateLimitPerMinute,
1425
2026
  });
1426
2027
  this.queue = queue;
1427
- const watcher = new FileSystemWatcher(this.config.watch, queue, processor, logger);
2028
+ const watcher = this.factories.createFileSystemWatcher(this.config.watch, queue, processor, logger);
1428
2029
  this.watcher = watcher;
1429
- const server = createApiServer({
2030
+ const server = this.factories.createApiServer({
1430
2031
  processor,
1431
2032
  vectorStore,
1432
2033
  embeddingProvider,
@@ -1437,7 +2038,7 @@
1437
2038
  this.server = server;
1438
2039
  await server.listen({
1439
2040
  host: this.config.api?.host ?? '127.0.0.1',
1440
- port: this.config.api?.port ?? 3458,
2041
+ port: this.config.api?.port ?? 3456,
1441
2042
  });
1442
2043
  watcher.start();
1443
2044
  this.startConfigWatch();
@@ -1453,12 +2054,17 @@
1453
2054
  }
1454
2055
  if (this.queue) {
1455
2056
  const timeout = this.config.shutdownTimeoutMs ?? 10000;
1456
- await Promise.race([
1457
- this.queue.drain(),
2057
+ const drained = await Promise.race([
2058
+ this.queue.drain().then(() => true),
1458
2059
  new Promise((resolve) => {
1459
- setTimeout(resolve, timeout);
2060
+ setTimeout(() => {
2061
+ resolve(false);
2062
+ }, timeout);
1460
2063
  }),
1461
2064
  ]);
2065
+ if (!drained) {
2066
+ this.logger?.warn({ timeoutMs: timeout }, 'Queue drain timeout hit, forcing shutdown');
2067
+ }
1462
2068
  }
1463
2069
  if (this.server) {
1464
2070
  await this.server.close();
@@ -1477,28 +2083,18 @@
1477
2083
  return;
1478
2084
  }
1479
2085
  const debounceMs = this.config.configWatch?.debounceMs ?? 10000;
1480
- this.configWatcher = chokidar.watch(this.configPath, {
1481
- ignoreInitial: true,
1482
- });
1483
- this.configWatcher.on('change', () => {
1484
- if (this.configDebounce)
1485
- clearTimeout(this.configDebounce);
1486
- this.configDebounce = setTimeout(() => {
1487
- void this.reloadConfig();
1488
- }, debounceMs);
1489
- });
1490
- this.configWatcher.on('error', (error) => {
1491
- logger.error({ error }, 'Config watcher error');
2086
+ this.configWatcher = new ConfigWatcher({
2087
+ configPath: this.configPath,
2088
+ enabled,
2089
+ debounceMs,
2090
+ logger,
2091
+ onChange: async () => this.reloadConfig(),
1492
2092
  });
1493
- logger.info({ configPath: this.configPath, debounceMs }, 'Config watcher started');
2093
+ this.configWatcher.start();
1494
2094
  }
1495
2095
  async stopConfigWatch() {
1496
- if (this.configDebounce) {
1497
- clearTimeout(this.configDebounce);
1498
- this.configDebounce = undefined;
1499
- }
1500
2096
  if (this.configWatcher) {
1501
- await this.configWatcher.close();
2097
+ await this.configWatcher.stop();
1502
2098
  this.configWatcher = undefined;
1503
2099
  }
1504
2100
  }
@@ -1507,10 +2103,11 @@
1507
2103
  const processor = this.processor;
1508
2104
  if (!logger || !processor || !this.configPath)
1509
2105
  return;
2106
+ logger.info({ configPath: this.configPath }, 'Config change detected, reloading...');
1510
2107
  try {
1511
- const newConfig = await loadConfig(this.configPath);
2108
+ const newConfig = await this.factories.loadConfig(this.configPath);
1512
2109
  this.config = newConfig;
1513
- const compiledRules = compileRules(newConfig.inferenceRules ?? []);
2110
+ const compiledRules = this.factories.compileRules(newConfig.inferenceRules ?? []);
1514
2111
  processor.updateRules(compiledRules);
1515
2112
  logger.info({ configPath: this.configPath, rules: compiledRules.length }, 'Config reloaded');
1516
2113
  }
@@ -1528,12 +2125,7 @@
1528
2125
  async function startFromConfig(configPath) {
1529
2126
  const config = await loadConfig(configPath);
1530
2127
  const app = new JeevesWatcher(config, configPath);
1531
- const shutdown = async () => {
1532
- await app.stop();
1533
- process.exit(0);
1534
- };
1535
- process.on('SIGTERM', () => void shutdown());
1536
- process.on('SIGINT', () => void shutdown());
2128
+ installShutdownHandlers(() => app.stop());
1537
2129
  await app.start();
1538
2130
  return app;
1539
2131
  }
@@ -1543,20 +2135,28 @@
1543
2135
  exports.FileSystemWatcher = FileSystemWatcher;
1544
2136
  exports.JeevesWatcher = JeevesWatcher;
1545
2137
  exports.VectorStoreClient = VectorStoreClient;
2138
+ exports.apiConfigSchema = apiConfigSchema;
1546
2139
  exports.applyRules = applyRules;
1547
2140
  exports.buildAttributes = buildAttributes;
1548
2141
  exports.compileRules = compileRules;
2142
+ exports.configWatchConfigSchema = configWatchConfigSchema;
1549
2143
  exports.contentHash = contentHash;
1550
2144
  exports.createApiServer = createApiServer;
1551
2145
  exports.createEmbeddingProvider = createEmbeddingProvider;
1552
2146
  exports.createLogger = createLogger;
1553
2147
  exports.deleteMetadata = deleteMetadata;
2148
+ exports.embeddingConfigSchema = embeddingConfigSchema;
1554
2149
  exports.extractText = extractText;
2150
+ exports.inferenceRuleSchema = inferenceRuleSchema;
2151
+ exports.jeevesWatcherConfigSchema = jeevesWatcherConfigSchema;
1555
2152
  exports.loadConfig = loadConfig;
2153
+ exports.loggingConfigSchema = loggingConfigSchema;
1556
2154
  exports.metadataPath = metadataPath;
1557
2155
  exports.pointId = pointId;
1558
2156
  exports.readMetadata = readMetadata;
1559
2157
  exports.startFromConfig = startFromConfig;
2158
+ exports.vectorStoreConfigSchema = vectorStoreConfigSchema;
2159
+ exports.watchConfigSchema = watchConfigSchema;
1560
2160
  exports.writeMetadata = writeMetadata;
1561
2161
 
1562
- })(this["jeeves-watcher"] = this["jeeves-watcher"] || {}, Fastify, node_crypto, promises, node_path, picomatch, chokidar, Ajv, cosmiconfig, googleGenai, pino, textsplitters, cheerio, yaml, mammoth, uuid, addFormats, jsClientRest);
2162
+ })(this["jeeves-watcher"] = this["jeeves-watcher"] || {}, Fastify, promises, node_path, picomatch, radash, node_crypto, cosmiconfig, zod, jsonmap, googleGenai, pino, uuid, cheerio, yaml, mammoth, Ajv, addFormats, textsplitters, jsClientRest, chokidar);