@karmaniverous/jeeves-watcher 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -16
- package/config.schema.json +577 -0
- package/dist/cjs/index.js +800 -340
- package/dist/cli/jeeves-watcher/index.js +1130 -517
- package/dist/index.d.ts +160 -103
- package/dist/index.iife.js +796 -339
- package/dist/index.iife.min.js +1 -1
- package/dist/mjs/index.js +793 -341
- package/package.json +28 -22
|
@@ -1,24 +1,31 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
#!/usr/bin/env node
|
|
3
|
-
import { readFile, rm, mkdir, writeFile, readdir, stat } from 'node:fs/promises';
|
|
4
3
|
import { Command } from '@commander-js/extra-typings';
|
|
5
4
|
import chokidar from 'chokidar';
|
|
6
5
|
import Fastify from 'fastify';
|
|
6
|
+
import { omit, get } from 'radash';
|
|
7
7
|
import { createHash } from 'node:crypto';
|
|
8
|
+
import { rm, readFile, mkdir, writeFile, readdir, stat } from 'node:fs/promises';
|
|
8
9
|
import { dirname, join, resolve, extname, basename } from 'node:path';
|
|
9
10
|
import picomatch from 'picomatch';
|
|
10
|
-
import Ajv from 'ajv';
|
|
11
11
|
import { cosmiconfig } from 'cosmiconfig';
|
|
12
|
+
import { z, ZodError } from 'zod';
|
|
13
|
+
import { jsonMapMapSchema, JsonMap } from '@karmaniverous/jsonmap';
|
|
12
14
|
import { GoogleGenerativeAIEmbeddings } from '@langchain/google-genai';
|
|
13
15
|
import pino from 'pino';
|
|
14
|
-
import {
|
|
16
|
+
import { v5 } from 'uuid';
|
|
15
17
|
import * as cheerio from 'cheerio';
|
|
16
18
|
import yaml from 'js-yaml';
|
|
17
19
|
import mammoth from 'mammoth';
|
|
18
|
-
import
|
|
20
|
+
import Ajv from 'ajv';
|
|
19
21
|
import addFormats from 'ajv-formats';
|
|
22
|
+
import { MarkdownTextSplitter, RecursiveCharacterTextSplitter } from '@langchain/textsplitters';
|
|
20
23
|
import { QdrantClient } from '@qdrant/js-client-rest';
|
|
21
24
|
|
|
25
|
+
/**
|
|
26
|
+
* @module metadata/metadata
|
|
27
|
+
* Persists file metadata as .meta.json. I/O: reads/writes/deletes metadata files under metadataDir. Path mapping via SHA-256 hash.
|
|
28
|
+
*/
|
|
22
29
|
/**
|
|
23
30
|
* Normalise a file path for deterministic mapping: lowercase, forward slashes, strip leading drive letter colon.
|
|
24
31
|
*
|
|
@@ -158,6 +165,30 @@ async function listFilesFromGlobs(patterns, ignored = []) {
|
|
|
158
165
|
return Array.from(seen);
|
|
159
166
|
}
|
|
160
167
|
|
|
168
|
+
/**
|
|
169
|
+
* @module processAllFiles
|
|
170
|
+
*
|
|
171
|
+
* Shared helper for processing all files matching configured globs.
|
|
172
|
+
*/
|
|
173
|
+
/**
|
|
174
|
+
* Process all files from globs using the specified processor method.
|
|
175
|
+
*
|
|
176
|
+
* @param watchPaths - The glob patterns to match.
|
|
177
|
+
* @param ignoredPaths - The glob patterns to ignore.
|
|
178
|
+
* @param processor - The document processor instance.
|
|
179
|
+
* @param method - The processor method to call ('processFile' or 'processRulesUpdate').
|
|
180
|
+
* @returns The number of files processed.
|
|
181
|
+
*/
|
|
182
|
+
async function processAllFiles(watchPaths, ignoredPaths, processor, method) {
|
|
183
|
+
const files = await listFilesFromGlobs(watchPaths, ignoredPaths);
|
|
184
|
+
for (const file of files) {
|
|
185
|
+
// Sequential on purpose to avoid surprising load.
|
|
186
|
+
// Queue integration can come later.
|
|
187
|
+
await processor[method](file);
|
|
188
|
+
}
|
|
189
|
+
return files.length;
|
|
190
|
+
}
|
|
191
|
+
|
|
161
192
|
/**
|
|
162
193
|
* Create the Fastify API server with all routes registered.
|
|
163
194
|
*
|
|
@@ -198,15 +229,8 @@ function createApiServer(options) {
|
|
|
198
229
|
});
|
|
199
230
|
app.post('/reindex', async (_request, reply) => {
|
|
200
231
|
try {
|
|
201
|
-
const
|
|
202
|
-
|
|
203
|
-
// Sequential on purpose to avoid surprising load.
|
|
204
|
-
// Queue integration can come later.
|
|
205
|
-
await processor.processFile(file);
|
|
206
|
-
}
|
|
207
|
-
return await reply
|
|
208
|
-
.status(200)
|
|
209
|
-
.send({ ok: true, filesIndexed: files.length });
|
|
232
|
+
const count = await processAllFiles(options.config.watch.paths, options.config.watch.ignored, processor, 'processFile');
|
|
233
|
+
return await reply.status(200).send({ ok: true, filesIndexed: count });
|
|
210
234
|
}
|
|
211
235
|
catch (error) {
|
|
212
236
|
logger.error({ error }, 'Reindex failed');
|
|
@@ -216,19 +240,21 @@ function createApiServer(options) {
|
|
|
216
240
|
app.post('/rebuild-metadata', async (_request, reply) => {
|
|
217
241
|
try {
|
|
218
242
|
const metadataDir = options.config.metadataDir ?? '.jeeves-metadata';
|
|
243
|
+
const SYSTEM_KEYS = [
|
|
244
|
+
'file_path',
|
|
245
|
+
'chunk_index',
|
|
246
|
+
'total_chunks',
|
|
247
|
+
'content_hash',
|
|
248
|
+
'chunk_text',
|
|
249
|
+
];
|
|
219
250
|
for await (const point of vectorStore.scroll()) {
|
|
220
251
|
const payload = point.payload;
|
|
221
252
|
const filePath = payload['file_path'];
|
|
222
253
|
if (typeof filePath !== 'string' || filePath.length === 0)
|
|
223
254
|
continue;
|
|
224
255
|
// Persist only enrichment-ish fields, not chunking/index fields.
|
|
225
|
-
const
|
|
226
|
-
|
|
227
|
-
delete rest.chunk_index;
|
|
228
|
-
delete rest.total_chunks;
|
|
229
|
-
delete rest.content_hash;
|
|
230
|
-
delete rest.chunk_text;
|
|
231
|
-
await writeMetadata(filePath, metadataDir, rest);
|
|
256
|
+
const enrichment = omit(payload, SYSTEM_KEYS);
|
|
257
|
+
await writeMetadata(filePath, metadataDir, enrichment);
|
|
232
258
|
}
|
|
233
259
|
return await reply.status(200).send({ ok: true });
|
|
234
260
|
}
|
|
@@ -245,20 +271,13 @@ function createApiServer(options) {
|
|
|
245
271
|
try {
|
|
246
272
|
if (scope === 'rules') {
|
|
247
273
|
// Re-apply inference rules to all files, update Qdrant payloads (no re-embedding)
|
|
248
|
-
const
|
|
249
|
-
|
|
250
|
-
// Use the new processRulesUpdate method
|
|
251
|
-
await processor.processRulesUpdate(file);
|
|
252
|
-
}
|
|
253
|
-
logger.info({ scope, filesProcessed: files.length }, 'Config reindex (rules) completed');
|
|
274
|
+
const count = await processAllFiles(options.config.watch.paths, options.config.watch.ignored, processor, 'processRulesUpdate');
|
|
275
|
+
logger.info({ scope, filesProcessed: count }, 'Config reindex (rules) completed');
|
|
254
276
|
}
|
|
255
277
|
else {
|
|
256
278
|
// Full reindex: re-extract, re-embed, re-upsert
|
|
257
|
-
const
|
|
258
|
-
|
|
259
|
-
await processor.processFile(file);
|
|
260
|
-
}
|
|
261
|
-
logger.info({ scope, filesProcessed: files.length }, 'Config reindex (full) completed');
|
|
279
|
+
const count = await processAllFiles(options.config.watch.paths, options.config.watch.ignored, processor, 'processFile');
|
|
280
|
+
logger.info({ scope, filesProcessed: count }, 'Config reindex (full) completed');
|
|
262
281
|
}
|
|
263
282
|
}
|
|
264
283
|
catch (error) {
|
|
@@ -275,117 +294,270 @@ function createApiServer(options) {
|
|
|
275
294
|
return app;
|
|
276
295
|
}
|
|
277
296
|
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
watch: {
|
|
285
|
-
type: 'object',
|
|
286
|
-
required: ['paths'],
|
|
287
|
-
properties: {
|
|
288
|
-
paths: { type: 'array', items: { type: 'string' }, minItems: 1 },
|
|
289
|
-
ignored: { type: 'array', items: { type: 'string' } },
|
|
290
|
-
pollIntervalMs: { type: 'number' },
|
|
291
|
-
usePolling: { type: 'boolean' },
|
|
292
|
-
debounceMs: { type: 'number' },
|
|
293
|
-
stabilityThresholdMs: { type: 'number' },
|
|
294
|
-
},
|
|
295
|
-
additionalProperties: false,
|
|
296
|
-
},
|
|
297
|
-
configWatch: {
|
|
298
|
-
type: 'object',
|
|
299
|
-
properties: {
|
|
300
|
-
enabled: { type: 'boolean' },
|
|
301
|
-
debounceMs: { type: 'number' },
|
|
302
|
-
},
|
|
303
|
-
additionalProperties: false,
|
|
304
|
-
},
|
|
305
|
-
embedding: {
|
|
306
|
-
type: 'object',
|
|
307
|
-
required: ['provider', 'model'],
|
|
308
|
-
properties: {
|
|
309
|
-
provider: { type: 'string' },
|
|
310
|
-
model: { type: 'string' },
|
|
311
|
-
chunkSize: { type: 'number' },
|
|
312
|
-
chunkOverlap: { type: 'number' },
|
|
313
|
-
dimensions: { type: 'number' },
|
|
314
|
-
apiKey: { type: 'string' },
|
|
315
|
-
rateLimitPerMinute: { type: 'number' },
|
|
316
|
-
concurrency: { type: 'number' },
|
|
317
|
-
},
|
|
318
|
-
additionalProperties: false,
|
|
319
|
-
},
|
|
320
|
-
vectorStore: {
|
|
321
|
-
type: 'object',
|
|
322
|
-
required: ['url', 'collectionName'],
|
|
323
|
-
properties: {
|
|
324
|
-
url: { type: 'string' },
|
|
325
|
-
collectionName: { type: 'string' },
|
|
326
|
-
apiKey: { type: 'string' },
|
|
327
|
-
},
|
|
328
|
-
additionalProperties: false,
|
|
329
|
-
},
|
|
330
|
-
metadataDir: { type: 'string' },
|
|
331
|
-
api: {
|
|
332
|
-
type: 'object',
|
|
333
|
-
properties: {
|
|
334
|
-
host: { type: 'string' },
|
|
335
|
-
port: { type: 'number' },
|
|
336
|
-
},
|
|
337
|
-
additionalProperties: false,
|
|
338
|
-
},
|
|
339
|
-
extractors: { type: 'object' },
|
|
340
|
-
inferenceRules: {
|
|
341
|
-
type: 'array',
|
|
342
|
-
items: {
|
|
343
|
-
type: 'object',
|
|
344
|
-
required: ['match', 'set'],
|
|
345
|
-
properties: {
|
|
346
|
-
match: { type: 'object' },
|
|
347
|
-
set: { type: 'object' },
|
|
348
|
-
},
|
|
349
|
-
additionalProperties: false,
|
|
350
|
-
},
|
|
351
|
-
},
|
|
352
|
-
logging: {
|
|
353
|
-
type: 'object',
|
|
354
|
-
properties: {
|
|
355
|
-
level: { type: 'string' },
|
|
356
|
-
file: { type: 'string' },
|
|
357
|
-
},
|
|
358
|
-
additionalProperties: false,
|
|
359
|
-
},
|
|
360
|
-
shutdownTimeoutMs: { type: 'number' },
|
|
361
|
-
},
|
|
362
|
-
additionalProperties: false,
|
|
363
|
-
};
|
|
364
|
-
const ajv = new Ajv({ allErrors: true });
|
|
365
|
-
const validate = ajv.compile(configSchema);
|
|
366
|
-
/** Default values for optional configuration fields. */
|
|
367
|
-
const DEFAULTS = {
|
|
368
|
-
configWatch: { enabled: true, debounceMs: 1000 },
|
|
297
|
+
/**
|
|
298
|
+
* @module config/defaults
|
|
299
|
+
* Default configuration values for jeeves-watcher. Pure data export, no I/O or side effects.
|
|
300
|
+
*/
|
|
301
|
+
/** Default root-level config values. */
|
|
302
|
+
const ROOT_DEFAULTS = {
|
|
369
303
|
metadataDir: '.jeeves-watcher',
|
|
370
|
-
api: { host: '127.0.0.1', port: 3100 },
|
|
371
|
-
logging: { level: 'info' },
|
|
372
304
|
shutdownTimeoutMs: 10000,
|
|
373
305
|
};
|
|
374
|
-
/** Default values
|
|
306
|
+
/** Default configWatch values. */
|
|
307
|
+
const CONFIG_WATCH_DEFAULTS = {
|
|
308
|
+
enabled: true,
|
|
309
|
+
debounceMs: 1000,
|
|
310
|
+
};
|
|
311
|
+
/** Default API values. */
|
|
312
|
+
const API_DEFAULTS = {
|
|
313
|
+
host: '127.0.0.1',
|
|
314
|
+
port: 3456,
|
|
315
|
+
};
|
|
316
|
+
/** Default logging values. */
|
|
317
|
+
const LOGGING_DEFAULTS = {
|
|
318
|
+
level: 'info',
|
|
319
|
+
};
|
|
320
|
+
/** Default watch configuration. */
|
|
375
321
|
const WATCH_DEFAULTS = {
|
|
376
322
|
debounceMs: 300,
|
|
377
323
|
stabilityThresholdMs: 500,
|
|
378
324
|
usePolling: false,
|
|
379
325
|
pollIntervalMs: 1000,
|
|
380
326
|
};
|
|
381
|
-
/** Default
|
|
327
|
+
/** Default embedding configuration. */
|
|
382
328
|
const EMBEDDING_DEFAULTS = {
|
|
383
329
|
chunkSize: 1000,
|
|
384
330
|
chunkOverlap: 200,
|
|
385
|
-
dimensions:
|
|
331
|
+
dimensions: 3072,
|
|
386
332
|
rateLimitPerMinute: 300,
|
|
387
333
|
concurrency: 5,
|
|
388
334
|
};
|
|
335
|
+
/** Default init command config template. */
|
|
336
|
+
const INIT_CONFIG_TEMPLATE = {
|
|
337
|
+
$schema: 'node_modules/@karmaniverous/jeeves-watcher/config.schema.json',
|
|
338
|
+
watch: {
|
|
339
|
+
paths: ['**/*.{md,markdown,txt,text,json,html,htm,pdf,docx}'],
|
|
340
|
+
ignored: ['**/node_modules/**', '**/.git/**', '**/.jeeves-watcher/**'],
|
|
341
|
+
},
|
|
342
|
+
configWatch: CONFIG_WATCH_DEFAULTS,
|
|
343
|
+
embedding: {
|
|
344
|
+
provider: 'gemini',
|
|
345
|
+
model: 'gemini-embedding-001',
|
|
346
|
+
dimensions: EMBEDDING_DEFAULTS.dimensions,
|
|
347
|
+
},
|
|
348
|
+
vectorStore: {
|
|
349
|
+
url: 'http://127.0.0.1:6333',
|
|
350
|
+
collectionName: 'jeeves-watcher',
|
|
351
|
+
},
|
|
352
|
+
metadataDir: ROOT_DEFAULTS.metadataDir,
|
|
353
|
+
api: API_DEFAULTS,
|
|
354
|
+
logging: LOGGING_DEFAULTS,
|
|
355
|
+
};
|
|
356
|
+
|
|
357
|
+
/**
|
|
358
|
+
* Watch configuration for file system monitoring.
|
|
359
|
+
*/
|
|
360
|
+
const watchConfigSchema = z.object({
|
|
361
|
+
/** Glob patterns to watch. */
|
|
362
|
+
paths: z
|
|
363
|
+
.array(z.string())
|
|
364
|
+
.min(1)
|
|
365
|
+
.describe('Glob patterns for files to watch (e.g., "**/*.md"). At least one required.'),
|
|
366
|
+
/** Glob patterns to ignore. */
|
|
367
|
+
ignored: z
|
|
368
|
+
.array(z.string())
|
|
369
|
+
.optional()
|
|
370
|
+
.describe('Glob patterns to exclude from watching (e.g., "**/node_modules/**").'),
|
|
371
|
+
/** Polling interval in milliseconds. */
|
|
372
|
+
pollIntervalMs: z
|
|
373
|
+
.number()
|
|
374
|
+
.optional()
|
|
375
|
+
.describe('Polling interval in milliseconds when usePolling is enabled.'),
|
|
376
|
+
/** Whether to use polling instead of native watchers. */
|
|
377
|
+
usePolling: z
|
|
378
|
+
.boolean()
|
|
379
|
+
.optional()
|
|
380
|
+
.describe('Use polling instead of native file system events (for network drives).'),
|
|
381
|
+
/** Debounce delay in milliseconds for file change events. */
|
|
382
|
+
debounceMs: z
|
|
383
|
+
.number()
|
|
384
|
+
.optional()
|
|
385
|
+
.describe('Debounce delay in milliseconds for file change events.'),
|
|
386
|
+
/** Time in milliseconds a file must be stable before processing. */
|
|
387
|
+
stabilityThresholdMs: z
|
|
388
|
+
.number()
|
|
389
|
+
.optional()
|
|
390
|
+
.describe('Time in milliseconds a file must remain unchanged before processing.'),
|
|
391
|
+
});
|
|
392
|
+
/**
|
|
393
|
+
* Configuration watch settings.
|
|
394
|
+
*/
|
|
395
|
+
const configWatchConfigSchema = z.object({
|
|
396
|
+
/** Whether config file watching is enabled. */
|
|
397
|
+
enabled: z
|
|
398
|
+
.boolean()
|
|
399
|
+
.optional()
|
|
400
|
+
.describe('Enable automatic reloading when config file changes.'),
|
|
401
|
+
/** Debounce delay in milliseconds for config change events. */
|
|
402
|
+
debounceMs: z
|
|
403
|
+
.number()
|
|
404
|
+
.optional()
|
|
405
|
+
.describe('Debounce delay in milliseconds for config file change detection.'),
|
|
406
|
+
});
|
|
407
|
+
/**
|
|
408
|
+
* Embedding model configuration.
|
|
409
|
+
*/
|
|
410
|
+
const embeddingConfigSchema = z.object({
|
|
411
|
+
/** The embedding model provider. */
|
|
412
|
+
provider: z
|
|
413
|
+
.string()
|
|
414
|
+
.default('gemini')
|
|
415
|
+
.describe('Embedding provider name (e.g., "gemini", "openai").'),
|
|
416
|
+
/** The embedding model name. */
|
|
417
|
+
model: z
|
|
418
|
+
.string()
|
|
419
|
+
.default('gemini-embedding-001')
|
|
420
|
+
.describe('Embedding model identifier (e.g., "gemini-embedding-001", "text-embedding-3-small").'),
|
|
421
|
+
/** Maximum tokens per chunk for splitting. */
|
|
422
|
+
chunkSize: z
|
|
423
|
+
.number()
|
|
424
|
+
.optional()
|
|
425
|
+
.describe('Maximum chunk size in characters for text splitting.'),
|
|
426
|
+
/** Overlap between chunks in tokens. */
|
|
427
|
+
chunkOverlap: z
|
|
428
|
+
.number()
|
|
429
|
+
.optional()
|
|
430
|
+
.describe('Character overlap between consecutive chunks.'),
|
|
431
|
+
/** Embedding vector dimensions. */
|
|
432
|
+
dimensions: z
|
|
433
|
+
.number()
|
|
434
|
+
.optional()
|
|
435
|
+
.describe('Embedding vector dimensions (must match model output).'),
|
|
436
|
+
/** API key for the embedding provider. */
|
|
437
|
+
apiKey: z
|
|
438
|
+
.string()
|
|
439
|
+
.optional()
|
|
440
|
+
.describe('API key for embedding provider (supports ${ENV_VAR} substitution).'),
|
|
441
|
+
/** Maximum embedding requests per minute. */
|
|
442
|
+
rateLimitPerMinute: z
|
|
443
|
+
.number()
|
|
444
|
+
.optional()
|
|
445
|
+
.describe('Maximum embedding API requests per minute (rate limiting).'),
|
|
446
|
+
/** Maximum concurrent embedding requests. */
|
|
447
|
+
concurrency: z
|
|
448
|
+
.number()
|
|
449
|
+
.optional()
|
|
450
|
+
.describe('Maximum concurrent embedding requests.'),
|
|
451
|
+
});
|
|
452
|
+
/**
|
|
453
|
+
* Vector store configuration for Qdrant.
|
|
454
|
+
*/
|
|
455
|
+
const vectorStoreConfigSchema = z.object({
|
|
456
|
+
/** Qdrant server URL. */
|
|
457
|
+
url: z
|
|
458
|
+
.string()
|
|
459
|
+
.describe('Qdrant server URL (e.g., "http://localhost:6333").'),
|
|
460
|
+
/** Qdrant collection name. */
|
|
461
|
+
collectionName: z
|
|
462
|
+
.string()
|
|
463
|
+
.describe('Qdrant collection name for vector storage.'),
|
|
464
|
+
/** Qdrant API key. */
|
|
465
|
+
apiKey: z
|
|
466
|
+
.string()
|
|
467
|
+
.optional()
|
|
468
|
+
.describe('Qdrant API key for authentication (supports ${ENV_VAR} substitution).'),
|
|
469
|
+
});
|
|
470
|
+
/**
|
|
471
|
+
* API server configuration.
|
|
472
|
+
*/
|
|
473
|
+
const apiConfigSchema = z.object({
|
|
474
|
+
/** Host to bind to. */
|
|
475
|
+
host: z
|
|
476
|
+
.string()
|
|
477
|
+
.optional()
|
|
478
|
+
.describe('Host address for API server (e.g., "127.0.0.1", "0.0.0.0").'),
|
|
479
|
+
/** Port to listen on. */
|
|
480
|
+
port: z.number().optional().describe('Port for API server (e.g., 3456).'),
|
|
481
|
+
});
|
|
482
|
+
/**
|
|
483
|
+
* Logging configuration.
|
|
484
|
+
*/
|
|
485
|
+
const loggingConfigSchema = z.object({
|
|
486
|
+
/** Log level. */
|
|
487
|
+
level: z
|
|
488
|
+
.string()
|
|
489
|
+
.optional()
|
|
490
|
+
.describe('Logging level (trace, debug, info, warn, error, fatal).'),
|
|
491
|
+
/** Log file path. */
|
|
492
|
+
file: z
|
|
493
|
+
.string()
|
|
494
|
+
.optional()
|
|
495
|
+
.describe('Path to log file (logs to stdout if omitted).'),
|
|
496
|
+
});
|
|
497
|
+
/**
|
|
498
|
+
* An inference rule that enriches document metadata.
|
|
499
|
+
*/
|
|
500
|
+
const inferenceRuleSchema = z.object({
|
|
501
|
+
/** JSON Schema object to match against document metadata. */
|
|
502
|
+
match: z
|
|
503
|
+
.record(z.string(), z.unknown())
|
|
504
|
+
.describe('JSON Schema object to match against file attributes.'),
|
|
505
|
+
/** Metadata fields to set when the rule matches. */
|
|
506
|
+
set: z
|
|
507
|
+
.record(z.string(), z.unknown())
|
|
508
|
+
.describe('Metadata fields to set when match succeeds.'),
|
|
509
|
+
/** JsonMap transformation (inline or reference to named map). */
|
|
510
|
+
map: z
|
|
511
|
+
.union([jsonMapMapSchema, z.string()])
|
|
512
|
+
.optional()
|
|
513
|
+
.describe('JsonMap transformation (inline definition or named map reference).'),
|
|
514
|
+
});
|
|
515
|
+
/**
|
|
516
|
+
* Top-level configuration for jeeves-watcher.
|
|
517
|
+
*/
|
|
518
|
+
const jeevesWatcherConfigSchema = z.object({
|
|
519
|
+
/** File system watch configuration. */
|
|
520
|
+
watch: watchConfigSchema.describe('File system watch configuration.'),
|
|
521
|
+
/** Configuration file watch settings. */
|
|
522
|
+
configWatch: configWatchConfigSchema
|
|
523
|
+
.optional()
|
|
524
|
+
.describe('Configuration file watch settings.'),
|
|
525
|
+
/** Embedding model configuration. */
|
|
526
|
+
embedding: embeddingConfigSchema.describe('Embedding model configuration.'),
|
|
527
|
+
/** Vector store configuration. */
|
|
528
|
+
vectorStore: vectorStoreConfigSchema.describe('Qdrant vector store configuration.'),
|
|
529
|
+
/** Directory for persisted metadata. */
|
|
530
|
+
metadataDir: z
|
|
531
|
+
.string()
|
|
532
|
+
.optional()
|
|
533
|
+
.describe('Directory for persisted metadata sidecar files.'),
|
|
534
|
+
/** API server configuration. */
|
|
535
|
+
api: apiConfigSchema.optional().describe('API server configuration.'),
|
|
536
|
+
/** Extractor configurations keyed by name. */
|
|
537
|
+
extractors: z
|
|
538
|
+
.record(z.string(), z.unknown())
|
|
539
|
+
.optional()
|
|
540
|
+
.describe('Extractor configurations keyed by name.'),
|
|
541
|
+
/** Rules for inferring metadata from document properties. */
|
|
542
|
+
inferenceRules: z
|
|
543
|
+
.array(inferenceRuleSchema)
|
|
544
|
+
.optional()
|
|
545
|
+
.describe('Rules for inferring metadata from file attributes.'),
|
|
546
|
+
/** Reusable named JsonMap transformations. */
|
|
547
|
+
maps: z
|
|
548
|
+
.record(z.string(), jsonMapMapSchema)
|
|
549
|
+
.optional()
|
|
550
|
+
.describe('Reusable named JsonMap transformations.'),
|
|
551
|
+
/** Logging configuration. */
|
|
552
|
+
logging: loggingConfigSchema.optional().describe('Logging configuration.'),
|
|
553
|
+
/** Timeout in milliseconds for graceful shutdown. */
|
|
554
|
+
shutdownTimeoutMs: z
|
|
555
|
+
.number()
|
|
556
|
+
.optional()
|
|
557
|
+
.describe('Timeout in milliseconds for graceful shutdown.'),
|
|
558
|
+
});
|
|
559
|
+
|
|
560
|
+
const MODULE_NAME = 'jeeves-watcher';
|
|
389
561
|
/**
|
|
390
562
|
* Merge sensible defaults into a loaded configuration.
|
|
391
563
|
*
|
|
@@ -394,13 +566,13 @@ const EMBEDDING_DEFAULTS = {
|
|
|
394
566
|
*/
|
|
395
567
|
function applyDefaults(raw) {
|
|
396
568
|
return {
|
|
397
|
-
...
|
|
569
|
+
...ROOT_DEFAULTS,
|
|
398
570
|
...raw,
|
|
399
571
|
watch: { ...WATCH_DEFAULTS, ...raw.watch },
|
|
400
|
-
configWatch: { ...
|
|
572
|
+
configWatch: { ...CONFIG_WATCH_DEFAULTS, ...raw.configWatch },
|
|
401
573
|
embedding: { ...EMBEDDING_DEFAULTS, ...raw.embedding },
|
|
402
|
-
api: { ...
|
|
403
|
-
logging: { ...
|
|
574
|
+
api: { ...API_DEFAULTS, ...raw.api },
|
|
575
|
+
logging: { ...LOGGING_DEFAULTS, ...raw.logging },
|
|
404
576
|
};
|
|
405
577
|
}
|
|
406
578
|
/**
|
|
@@ -418,21 +590,89 @@ async function loadConfig(configPath) {
|
|
|
418
590
|
if (!result || result.isEmpty) {
|
|
419
591
|
throw new Error('No jeeves-watcher configuration found. Create a .jeeves-watcherrc or jeeves-watcher.config.{js,ts,json,yaml} file.');
|
|
420
592
|
}
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
593
|
+
try {
|
|
594
|
+
const validated = jeevesWatcherConfigSchema.parse(result.config);
|
|
595
|
+
return applyDefaults(validated);
|
|
596
|
+
}
|
|
597
|
+
catch (error) {
|
|
598
|
+
if (error instanceof ZodError) {
|
|
599
|
+
const errors = error.issues
|
|
600
|
+
.map((issue) => `${issue.path.join('.')}: ${issue.message}`)
|
|
601
|
+
.join('; ');
|
|
602
|
+
throw new Error(`Invalid jeeves-watcher configuration: ${errors}`);
|
|
603
|
+
}
|
|
604
|
+
throw error;
|
|
605
|
+
}
|
|
606
|
+
}
|
|
607
|
+
|
|
608
|
+
/**
|
|
609
|
+
* @module util/retry
|
|
610
|
+
* Small async retry helper with exponential backoff. Side effects: sleeps between attempts; can invoke onRetry callback for logging.
|
|
611
|
+
*/
|
|
612
|
+
function sleep(ms, signal) {
|
|
613
|
+
if (ms <= 0)
|
|
614
|
+
return Promise.resolve();
|
|
615
|
+
return new Promise((resolve, reject) => {
|
|
616
|
+
const timer = setTimeout(() => {
|
|
617
|
+
cleanup();
|
|
618
|
+
resolve();
|
|
619
|
+
}, ms);
|
|
620
|
+
const onAbort = () => {
|
|
621
|
+
cleanup();
|
|
622
|
+
reject(new Error('Retry sleep aborted'));
|
|
623
|
+
};
|
|
624
|
+
const cleanup = () => {
|
|
625
|
+
clearTimeout(timer);
|
|
626
|
+
if (signal)
|
|
627
|
+
signal.removeEventListener('abort', onAbort);
|
|
628
|
+
};
|
|
629
|
+
if (signal) {
|
|
630
|
+
if (signal.aborted) {
|
|
631
|
+
onAbort();
|
|
632
|
+
return;
|
|
633
|
+
}
|
|
634
|
+
signal.addEventListener('abort', onAbort, { once: true });
|
|
635
|
+
}
|
|
636
|
+
});
|
|
637
|
+
}
|
|
638
|
+
function computeDelayMs(attempt, baseDelayMs, maxDelayMs, jitter = 0) {
|
|
639
|
+
const exp = Math.max(0, attempt - 1);
|
|
640
|
+
const raw = Math.min(maxDelayMs, baseDelayMs * 2 ** exp);
|
|
641
|
+
const factor = jitter > 0 ? 1 + Math.random() * jitter : 1;
|
|
642
|
+
return Math.round(raw * factor);
|
|
643
|
+
}
|
|
644
|
+
/**
|
|
645
|
+
* Retry an async operation using exponential backoff.
|
|
646
|
+
*
|
|
647
|
+
* @param fn - Operation to execute.
|
|
648
|
+
* @param options - Retry policy.
|
|
649
|
+
* @returns The operation result.
|
|
650
|
+
*/
|
|
651
|
+
async function retry(fn, options) {
|
|
652
|
+
const attempts = Math.max(1, options.attempts);
|
|
653
|
+
let lastError;
|
|
654
|
+
for (let attempt = 1; attempt <= attempts; attempt++) {
|
|
655
|
+
try {
|
|
656
|
+
return await fn(attempt);
|
|
657
|
+
}
|
|
658
|
+
catch (error) {
|
|
659
|
+
lastError = error;
|
|
660
|
+
const isLast = attempt >= attempts;
|
|
661
|
+
if (isLast)
|
|
662
|
+
break;
|
|
663
|
+
const delayMs = computeDelayMs(attempt, options.baseDelayMs, options.maxDelayMs, options.jitter);
|
|
664
|
+
options.onRetry?.({ attempt, attempts, delayMs, error });
|
|
665
|
+
await sleep(delayMs, options.signal);
|
|
666
|
+
}
|
|
432
667
|
}
|
|
433
|
-
|
|
668
|
+
throw lastError;
|
|
434
669
|
}
|
|
435
670
|
|
|
671
|
+
/**
|
|
672
|
+
* @module embedding
|
|
673
|
+
*
|
|
674
|
+
* Embedding provider abstractions and registry-backed factory.
|
|
675
|
+
*/
|
|
436
676
|
/**
|
|
437
677
|
* Create a mock embedding provider that generates deterministic vectors from content hashes.
|
|
438
678
|
*
|
|
@@ -460,10 +700,11 @@ function createMockProvider(dimensions) {
|
|
|
460
700
|
* Create a Gemini embedding provider using the Google Generative AI SDK.
|
|
461
701
|
*
|
|
462
702
|
* @param config - The embedding configuration.
|
|
703
|
+
* @param logger - Optional pino logger for retry warnings.
|
|
463
704
|
* @returns A Gemini {@link EmbeddingProvider}.
|
|
464
705
|
* @throws If the API key is missing.
|
|
465
706
|
*/
|
|
466
|
-
function createGeminiProvider(config) {
|
|
707
|
+
function createGeminiProvider(config, logger) {
|
|
467
708
|
if (!config.apiKey) {
|
|
468
709
|
throw new Error('Gemini embedding provider requires config.embedding.apiKey');
|
|
469
710
|
}
|
|
@@ -475,8 +716,43 @@ function createGeminiProvider(config) {
|
|
|
475
716
|
return {
|
|
476
717
|
dimensions,
|
|
477
718
|
async embed(texts) {
|
|
478
|
-
|
|
479
|
-
|
|
719
|
+
const vectors = await retry(async (attempt) => {
|
|
720
|
+
if (attempt > 1) {
|
|
721
|
+
const msg = {
|
|
722
|
+
attempt,
|
|
723
|
+
provider: 'gemini',
|
|
724
|
+
model: config.model,
|
|
725
|
+
};
|
|
726
|
+
if (logger) {
|
|
727
|
+
logger.warn(msg, 'Retrying embedding request');
|
|
728
|
+
}
|
|
729
|
+
else {
|
|
730
|
+
console.warn(msg, 'Retrying embedding request');
|
|
731
|
+
}
|
|
732
|
+
}
|
|
733
|
+
// embedDocuments returns vectors for multiple texts
|
|
734
|
+
return embedder.embedDocuments(texts);
|
|
735
|
+
}, {
|
|
736
|
+
attempts: 5,
|
|
737
|
+
baseDelayMs: 500,
|
|
738
|
+
maxDelayMs: 10_000,
|
|
739
|
+
jitter: 0.2,
|
|
740
|
+
onRetry: ({ attempt, delayMs, error }) => {
|
|
741
|
+
const msg = {
|
|
742
|
+
attempt,
|
|
743
|
+
delayMs,
|
|
744
|
+
provider: 'gemini',
|
|
745
|
+
model: config.model,
|
|
746
|
+
error,
|
|
747
|
+
};
|
|
748
|
+
if (logger) {
|
|
749
|
+
logger.warn(msg, 'Embedding call failed; will retry');
|
|
750
|
+
}
|
|
751
|
+
else {
|
|
752
|
+
console.warn(msg, 'Embedding call failed; will retry');
|
|
753
|
+
}
|
|
754
|
+
},
|
|
755
|
+
});
|
|
480
756
|
// Validate dimensions
|
|
481
757
|
for (const vector of vectors) {
|
|
482
758
|
if (vector.length !== dimensions) {
|
|
@@ -487,25 +763,36 @@ function createGeminiProvider(config) {
|
|
|
487
763
|
},
|
|
488
764
|
};
|
|
489
765
|
}
|
|
766
|
+
function createMockFromConfig(config) {
|
|
767
|
+
const dimensions = config.dimensions ?? 768;
|
|
768
|
+
return createMockProvider(dimensions);
|
|
769
|
+
}
|
|
770
|
+
const embeddingProviderRegistry = new Map([
|
|
771
|
+
['mock', createMockFromConfig],
|
|
772
|
+
['gemini', createGeminiProvider],
|
|
773
|
+
]);
|
|
490
774
|
/**
|
|
491
775
|
* Create an embedding provider based on the given configuration.
|
|
492
776
|
*
|
|
777
|
+
* Each provider is responsible for its own default dimensions.
|
|
778
|
+
*
|
|
493
779
|
* @param config - The embedding configuration.
|
|
780
|
+
* @param logger - Optional pino logger for retry warnings.
|
|
494
781
|
* @returns An {@link EmbeddingProvider} instance.
|
|
495
782
|
* @throws If the configured provider is not supported.
|
|
496
783
|
*/
|
|
497
|
-
function createEmbeddingProvider(config) {
|
|
498
|
-
const
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
return createMockProvider(dimensions);
|
|
502
|
-
case 'gemini':
|
|
503
|
-
return createGeminiProvider(config);
|
|
504
|
-
default:
|
|
505
|
-
throw new Error(`Unsupported embedding provider: ${config.provider}`);
|
|
784
|
+
function createEmbeddingProvider(config, logger) {
|
|
785
|
+
const factory = embeddingProviderRegistry.get(config.provider);
|
|
786
|
+
if (!factory) {
|
|
787
|
+
throw new Error(`Unsupported embedding provider: ${config.provider}`);
|
|
506
788
|
}
|
|
789
|
+
return factory(config, logger);
|
|
507
790
|
}
|
|
508
791
|
|
|
792
|
+
/**
|
|
793
|
+
* @module logger
|
|
794
|
+
* Creates pino logger instances. I/O: optionally writes logs to file via pino/file transport. Defaults to stdout at info level.
|
|
795
|
+
*/
|
|
509
796
|
/**
|
|
510
797
|
* Create a pino logger instance.
|
|
511
798
|
*
|
|
@@ -524,6 +811,54 @@ function createLogger(config) {
|
|
|
524
811
|
return pino({ level });
|
|
525
812
|
}
|
|
526
813
|
|
|
814
|
+
/**
|
|
815
|
+
* @module hash
|
|
816
|
+
* Provides SHA-256 content hashing. Pure function: given text string, returns hex digest. No I/O or side effects.
|
|
817
|
+
*/
|
|
818
|
+
/**
|
|
819
|
+
* Compute a SHA-256 hex digest of the given text.
|
|
820
|
+
*
|
|
821
|
+
* @param text - The input text to hash.
|
|
822
|
+
* @returns The hex-encoded SHA-256 hash.
|
|
823
|
+
*/
|
|
824
|
+
function contentHash(text) {
|
|
825
|
+
return createHash('sha256').update(text, 'utf8').digest('hex');
|
|
826
|
+
}
|
|
827
|
+
|
|
828
|
+
/**
|
|
829
|
+
* @module pointId
|
|
830
|
+
* Generates deterministic UUIDv5 point IDs for file paths and chunk indices. Pure function: normalizes paths, returns stable IDs. No I/O.
|
|
831
|
+
*/
|
|
832
|
+
/** Namespace UUID for jeeves-watcher point IDs. */
|
|
833
|
+
const NAMESPACE = '6a6f686e-6761-4c74-ad6a-656576657321';
|
|
834
|
+
/**
|
|
835
|
+
* Normalise a file path for deterministic point ID generation.
|
|
836
|
+
*
|
|
837
|
+
* @param filePath - The original file path.
|
|
838
|
+
* @returns The normalised path string.
|
|
839
|
+
*/
|
|
840
|
+
function normalisePath(filePath) {
|
|
841
|
+
return filePath.replace(/\\/g, '/').toLowerCase();
|
|
842
|
+
}
|
|
843
|
+
/**
|
|
844
|
+
* Generate a deterministic UUID v5 point ID for a file (and optional chunk index).
|
|
845
|
+
*
|
|
846
|
+
* @param filePath - The file path.
|
|
847
|
+
* @param chunkIndex - Optional chunk index within the file.
|
|
848
|
+
* @returns A deterministic UUID v5 string.
|
|
849
|
+
*/
|
|
850
|
+
function pointId(filePath, chunkIndex) {
|
|
851
|
+
const key = chunkIndex !== undefined
|
|
852
|
+
? `${normalisePath(filePath)}#${String(chunkIndex)}`
|
|
853
|
+
: normalisePath(filePath);
|
|
854
|
+
return v5(key, NAMESPACE);
|
|
855
|
+
}
|
|
856
|
+
|
|
857
|
+
/**
|
|
858
|
+
* @module extractors
|
|
859
|
+
*
|
|
860
|
+
* Text extraction registry for supported file formats.
|
|
861
|
+
*/
|
|
527
862
|
/**
|
|
528
863
|
* Extract YAML frontmatter from a Markdown document.
|
|
529
864
|
*
|
|
@@ -569,6 +904,55 @@ function extractJsonText(obj) {
|
|
|
569
904
|
}
|
|
570
905
|
return JSON.stringify(obj);
|
|
571
906
|
}
|
|
907
|
+
/**
 * Extract the body text and YAML frontmatter from a Markdown file.
 *
 * @param filePath - Path to the Markdown file.
 * @returns The body text and any parsed frontmatter.
 */
async function extractMarkdown(filePath) {
  const contents = await readFile(filePath, 'utf8');
  const { frontmatter, body } = extractMarkdownFrontmatter(contents);
  return { frontmatter, text: body };
}
|
|
912
|
+
/**
 * Read a plaintext file verbatim.
 *
 * @param filePath - Path to the text file.
 * @returns The full file contents as text.
 */
async function extractPlaintext(filePath) {
  const text = await readFile(filePath, 'utf8');
  return { text };
}
|
|
916
|
+
/**
 * Parse a JSON file, returning its flattened text plus the structured
 * object when the root is a plain (non-array, non-null) object.
 *
 * @param filePath - Path to the JSON file.
 * @returns Extracted text and, when applicable, the parsed object.
 */
async function extractJson(filePath) {
  const raw = await readFile(filePath, 'utf8');
  const parsed = JSON.parse(raw);
  const isPlainObject =
    typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed);
  return {
    text: extractJsonText(parsed),
    json: isPlainObject ? parsed : undefined,
  };
}
|
|
924
|
+
/**
 * Extract text from a PDF via the lazily-imported `unpdf` library.
 *
 * @param filePath - Path to the PDF file.
 * @returns The concatenated text of all pages.
 */
async function extractPdf(filePath) {
  const data = new Uint8Array(await readFile(filePath));
  // Lazy import keeps the heavy PDF dependency out of the startup path.
  const { extractText: extractPdfText } = await import('unpdf');
  const { text } = await extractPdfText(data);
  // unpdf may return one string per page; join pages with blank lines.
  return { text: Array.isArray(text) ? text.join('\n\n') : text };
}
|
|
933
|
+
/**
 * Extract raw text from a .docx file using mammoth.
 *
 * @param filePath - Path to the .docx file.
 * @returns The document's raw text.
 */
async function extractDocx(filePath) {
  const buffer = await readFile(filePath);
  const { value } = await mammoth.extractRawText({ buffer });
  return { text: value };
}
|
|
938
|
+
/**
 * Extract visible text from an HTML file, dropping script/style content.
 *
 * @param filePath - Path to the HTML file.
 * @returns The page's text content.
 */
async function extractHtml(filePath) {
  const html = await readFile(filePath, 'utf8');
  const $ = cheerio.load(html);
  $('script, style').remove();
  // Prefer body text; fall back to whole-document text for fragments.
  const text = $('body').text().trim() || $.text().trim();
  return { text };
}
|
|
945
|
+
/**
 * Registry mapping lower-cased file extensions (with leading dot) to their
 * extractor functions. Grouped by extractor so aliases stay together.
 */
const extractorRegistry = new Map();
for (const [extractor, extensions] of [
  [extractMarkdown, ['.md', '.markdown']],
  [extractPlaintext, ['.txt', '.text']],
  [extractJson, ['.json']],
  [extractPdf, ['.pdf']],
  [extractDocx, ['.docx']],
  [extractHtml, ['.html', '.htm']],
]) {
  for (const extension of extensions) {
    extractorRegistry.set(extension, extractor);
  }
}
|
|
572
956
|
/**
|
|
573
957
|
* Extract text from a file based on extension.
|
|
574
958
|
*
|
|
@@ -577,85 +961,11 @@ function extractJsonText(obj) {
|
|
|
577
961
|
* @returns Extracted text and optional structured data.
|
|
578
962
|
*/
|
|
579
963
|
/**
 * Extract text from a file based on extension.
 *
 * Dispatches to the registered extractor for the (case-insensitive)
 * extension; unknown extensions fall back to plaintext extraction.
 *
 * @param filePath - The file to read.
 * @param extension - File extension including the leading dot.
 * @returns Extracted text and optional structured data.
 */
async function extractText(filePath, extension) {
  const extractor = extractorRegistry.get(extension.toLowerCase());
  if (extractor) return extractor(filePath);
  // Default: treat as plaintext.
  return extractPlaintext(filePath);
}
|
|
660
970
|
|
|
661
971
|
/**
|
|
@@ -728,13 +1038,7 @@ function resolveTemplateVars(value, attributes) {
|
|
|
728
1038
|
if (typeof value !== 'string')
|
|
729
1039
|
return value;
|
|
730
1040
|
return value.replace(/\$\{([^}]+)\}/g, (_match, varPath) => {
|
|
731
|
-
const
|
|
732
|
-
let current = attributes;
|
|
733
|
-
for (const part of parts) {
|
|
734
|
-
if (current === null || current === undefined)
|
|
735
|
-
return '';
|
|
736
|
-
current = current[part];
|
|
737
|
-
}
|
|
1041
|
+
const current = get(attributes, varPath);
|
|
738
1042
|
if (current === null || current === undefined)
|
|
739
1043
|
return '';
|
|
740
1044
|
return typeof current === 'string' ? current : JSON.stringify(current);
|
|
@@ -754,25 +1058,170 @@ function resolveSet(setObj, attributes) {
|
|
|
754
1058
|
}
|
|
755
1059
|
return result;
|
|
756
1060
|
}
|
|
1061
|
+
/**
|
|
1062
|
+
* Create the lib object for JsonMap transformations.
|
|
1063
|
+
* Provides utility functions for path manipulation.
|
|
1064
|
+
*
|
|
1065
|
+
* @returns The lib object.
|
|
1066
|
+
*/
|
|
1067
|
+
/**
 * Create the lib object for JsonMap transformations.
 * Provides utility functions for string/array/path manipulation.
 *
 * @returns The lib object.
 */
function createJsonMapLib() {
  const helpers = {
    split: (str, separator) => str.split(separator),
    slice: (arr, start, end) => arr.slice(start, end),
    join: (arr, separator) => arr.join(separator),
    toLowerCase: (str) => str.toLowerCase(),
    replace: (str, search, replacement) => str.replace(search, replacement),
    // Delegates to radash's `get` for dotted-path lookups.
    get: (obj, path) => get(obj, path),
  };
  return helpers;
}
|
|
757
1077
|
/**
|
|
758
1078
|
* Apply compiled inference rules to file attributes, returning merged metadata.
|
|
759
1079
|
*
|
|
760
1080
|
* Rules are evaluated in order; later rules override earlier ones.
|
|
1081
|
+
* If a rule has a `map`, the JsonMap transformation is applied after `set` resolution,
|
|
1082
|
+
* and map output overrides set output on conflict.
|
|
761
1083
|
*
|
|
762
1084
|
* @param compiledRules - The compiled rules to evaluate.
|
|
763
1085
|
* @param attributes - The file attributes to match against.
|
|
1086
|
+
* @param namedMaps - Optional record of named JsonMap definitions.
|
|
1087
|
+
* @param logger - Optional pino logger for warnings (falls back to console.warn).
|
|
764
1088
|
* @returns The merged metadata from all matching rules.
|
|
765
1089
|
*/
|
|
766
|
-
function applyRules(compiledRules, attributes) {
|
|
1090
|
+
/**
 * Apply compiled inference rules to file attributes, returning merged metadata.
 *
 * Rules are evaluated in order; later rules override earlier ones.
 * If a rule has a `map`, the JsonMap transformation is applied after `set`
 * resolution, and map output overrides set output on conflict.
 *
 * @param compiledRules - The compiled rules to evaluate.
 * @param attributes - The file attributes to match against.
 * @param namedMaps - Optional record of named JsonMap definitions.
 * @param logger - Optional pino logger for warnings (falls back to console.warn).
 * @returns The merged metadata from all matching rules.
 */
async function applyRules(compiledRules, attributes, namedMaps, logger) {
  // JsonMap's type definitions expect a generic JsonMapLib shape with unary
  // functions. Our helper functions accept multiple args, which JsonMap
  // supports at runtime.
  const lib = createJsonMapLib();
  const log = logger ?? console;
  let merged = {};
  for (const { rule, validate } of compiledRules) {
    if (!validate(attributes)) continue;
    // Apply set resolution first; map output (below) wins on conflict.
    merged = { ...merged, ...resolveSet(rule.set, attributes) };
    if (!rule.map) continue;
    // Resolve map reference: strings look up shared named definitions.
    const mapDef =
      typeof rule.map === 'string' ? namedMaps?.[rule.map] : rule.map;
    if (!mapDef) {
      log.warn(`Map reference "${rule.map}" not found in named maps. Skipping map transformation.`);
      continue;
    }
    // Execute the JsonMap transformation; failures are logged, not fatal.
    try {
      const mapOutput = await new JsonMap(mapDef, lib).transform(attributes);
      const isPlainObject =
        mapOutput && typeof mapOutput === 'object' && !Array.isArray(mapOutput);
      if (isPlainObject) {
        merged = { ...merged, ...mapOutput };
      } else {
        log.warn(`JsonMap transformation did not return an object; skipping merge.`);
      }
    } catch (error) {
      log.warn(`JsonMap transformation failed: ${error instanceof Error ? error.message : String(error)}`);
    }
  }
  return merged;
}
|
|
775
1136
|
|
|
1137
|
+
/**
|
|
1138
|
+
* @module processor/buildMetadata
|
|
1139
|
+
* Builds merged metadata from file content, inference rules, and enrichment. I/O: reads files, extracts text, loads enrichment .meta.json.
|
|
1140
|
+
*/
|
|
1141
|
+
/**
|
|
1142
|
+
* Build merged metadata for a file by applying inference rules and merging with enrichment metadata.
|
|
1143
|
+
*
|
|
1144
|
+
* @param filePath - The file to process.
|
|
1145
|
+
* @param compiledRules - The compiled inference rules.
|
|
1146
|
+
* @param metadataDir - The metadata directory for enrichment files.
|
|
1147
|
+
* @param maps - Optional named JsonMap definitions.
|
|
1148
|
+
* @param logger - Optional logger for rule warnings.
|
|
1149
|
+
* @returns The merged metadata and intermediate data.
|
|
1150
|
+
*/
|
|
1151
|
+
/**
 * Build merged metadata for a file by applying inference rules and merging
 * with enrichment metadata (enrichment wins on conflict).
 *
 * @param filePath - The file to process.
 * @param compiledRules - The compiled inference rules.
 * @param metadataDir - The metadata directory for enrichment files.
 * @param maps - Optional named JsonMap definitions.
 * @param logger - Optional logger for rule warnings.
 * @returns The merged metadata and intermediate data.
 */
async function buildMergedMetadata(filePath, compiledRules, metadataDir, maps, logger) {
  const stats = await stat(filePath);
  // 1. Extract text and structured data.
  const extracted = await extractText(filePath, extname(filePath));
  // 2. Build attributes + apply rules.
  const attributes = buildAttributes(filePath, stats, extracted.frontmatter, extracted.json);
  const inferred = await applyRules(compiledRules, attributes, maps, logger);
  // 3. Read enrichment metadata (merge, enrichment wins).
  const enrichment = await readMetadata(filePath, metadataDir);
  const metadata = { ...inferred, ...(enrichment ?? {}) };
  return { inferred, enrichment, metadata, attributes, extracted };
}
|
|
1167
|
+
|
|
1168
|
+
/**
|
|
1169
|
+
* @module processor/chunkIds
|
|
1170
|
+
* Generates chunk point IDs from file paths and chunk indices. Extracts chunk counts from Qdrant payloads. Pure functions, no I/O.
|
|
1171
|
+
*/
|
|
1172
|
+
/**
|
|
1173
|
+
* Generate an array of chunk IDs for a file.
|
|
1174
|
+
*
|
|
1175
|
+
* @param filePath - The file path.
|
|
1176
|
+
* @param totalChunks - The total number of chunks.
|
|
1177
|
+
* @returns An array of point IDs for each chunk.
|
|
1178
|
+
*/
|
|
1179
|
+
/**
 * Generate an array of chunk point IDs for a file.
 *
 * @param filePath - The file path.
 * @param totalChunks - The total number of chunks.
 * @returns An array of point IDs for each chunk index 0..totalChunks-1.
 */
function chunkIds(filePath, totalChunks) {
  return Array.from({ length: totalChunks }, (_, index) => pointId(filePath, index));
}
|
|
1186
|
+
/**
|
|
1187
|
+
* Extract the total chunk count from a payload, with a fallback.
|
|
1188
|
+
*
|
|
1189
|
+
* @param payload - The Qdrant point payload (or null).
|
|
1190
|
+
* @param fallback - The fallback value if total_chunks is missing or invalid.
|
|
1191
|
+
* @returns The total chunk count.
|
|
1192
|
+
*/
|
|
1193
|
+
/**
 * Extract the total chunk count from a payload, with a fallback.
 *
 * @param payload - The Qdrant point payload (or null).
 * @param fallback - The fallback value if total_chunks is missing or invalid.
 * @returns The total chunk count.
 */
function getChunkCount(payload, fallback = 1) {
  const count = payload?.['total_chunks'];
  return typeof count === 'number' ? count : fallback;
}
|
|
1199
|
+
|
|
1200
|
+
/**
|
|
1201
|
+
* @module processor/splitter
|
|
1202
|
+
* Factory for LangChain text splitters. Returns MarkdownTextSplitter or RecursiveCharacterTextSplitter based on file extension. No I/O.
|
|
1203
|
+
*/
|
|
1204
|
+
/**
|
|
1205
|
+
* Create the appropriate text splitter for the given file extension.
|
|
1206
|
+
*
|
|
1207
|
+
* @param ext - File extension (including leading dot).
|
|
1208
|
+
* @param chunkSize - Maximum chunk size in characters.
|
|
1209
|
+
* @param chunkOverlap - Overlap between chunks in characters.
|
|
1210
|
+
* @returns A text splitter instance.
|
|
1211
|
+
*/
|
|
1212
|
+
/**
 * Create the appropriate text splitter for the given file extension.
 *
 * @param ext - File extension (including leading dot).
 * @param chunkSize - Maximum chunk size in characters.
 * @param chunkOverlap - Overlap between chunks in characters.
 * @returns A text splitter instance (Markdown-aware for .md/.markdown).
 */
function createSplitter(ext, chunkSize, chunkOverlap) {
  const options = { chunkSize, chunkOverlap };
  const isMarkdown = ['.md', '.markdown'].includes(ext.toLowerCase());
  return isMarkdown
    ? new MarkdownTextSplitter(options)
    : new RecursiveCharacterTextSplitter(options);
}
|
|
1219
|
+
|
|
1220
|
+
/**
|
|
1221
|
+
* @module processor
|
|
1222
|
+
*
|
|
1223
|
+
* Core document processing pipeline. Handles extracting text, computing embeddings, syncing with vector store.
|
|
1224
|
+
*/
|
|
776
1225
|
/**
|
|
777
1226
|
* Core document processing pipeline.
|
|
778
1227
|
*
|
|
@@ -784,11 +1233,10 @@ class DocumentProcessor {
|
|
|
784
1233
|
vectorStore;
|
|
785
1234
|
compiledRules;
|
|
786
1235
|
logger;
|
|
787
|
-
metadataDir;
|
|
788
1236
|
/**
|
|
789
1237
|
* Create a new DocumentProcessor.
|
|
790
1238
|
*
|
|
791
|
-
* @param config - The
|
|
1239
|
+
* @param config - The processor configuration.
|
|
792
1240
|
* @param embeddingProvider - The embedding provider.
|
|
793
1241
|
* @param vectorStore - The vector store client.
|
|
794
1242
|
* @param compiledRules - The compiled inference rules.
|
|
@@ -800,7 +1248,6 @@ class DocumentProcessor {
|
|
|
800
1248
|
this.vectorStore = vectorStore;
|
|
801
1249
|
this.compiledRules = compiledRules;
|
|
802
1250
|
this.logger = logger;
|
|
803
|
-
this.metadataDir = config.metadataDir ?? '.jeeves-metadata';
|
|
804
1251
|
}
|
|
805
1252
|
/**
|
|
806
1253
|
* Process a file through the full pipeline: extract, hash, chunk, embed, upsert.
|
|
@@ -810,9 +1257,8 @@ class DocumentProcessor {
|
|
|
810
1257
|
async processFile(filePath) {
|
|
811
1258
|
try {
|
|
812
1259
|
const ext = extname(filePath);
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
const extracted = await extractText(filePath, ext);
|
|
1260
|
+
// 1. Build merged metadata + extract text
|
|
1261
|
+
const { metadata, extracted } = await buildMergedMetadata(filePath, this.compiledRules, this.config.metadataDir, this.config.maps, this.logger);
|
|
816
1262
|
if (!extracted.text.trim()) {
|
|
817
1263
|
this.logger.debug({ filePath }, 'Skipping empty file');
|
|
818
1264
|
return;
|
|
@@ -825,26 +1271,15 @@ class DocumentProcessor {
|
|
|
825
1271
|
this.logger.debug({ filePath }, 'Content unchanged, skipping');
|
|
826
1272
|
return;
|
|
827
1273
|
}
|
|
828
|
-
const oldTotalChunks =
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
const
|
|
833
|
-
const inferred = applyRules(this.compiledRules, attributes);
|
|
834
|
-
// 4. Read enrichment metadata (merge, enrichment wins)
|
|
835
|
-
const enrichment = await readMetadata(filePath, this.metadataDir);
|
|
836
|
-
const metadata = {
|
|
837
|
-
...inferred,
|
|
838
|
-
...(enrichment ?? {}),
|
|
839
|
-
};
|
|
840
|
-
// 5. Chunk text
|
|
841
|
-
const chunkSize = this.config.embedding.chunkSize ?? 1000;
|
|
842
|
-
const chunkOverlap = this.config.embedding.chunkOverlap ?? 200;
|
|
843
|
-
const splitter = this.createSplitter(ext, chunkSize, chunkOverlap);
|
|
1274
|
+
const oldTotalChunks = getChunkCount(existingPayload);
|
|
1275
|
+
// 3. Chunk text
|
|
1276
|
+
const chunkSize = this.config.chunkSize ?? 1000;
|
|
1277
|
+
const chunkOverlap = this.config.chunkOverlap ?? 200;
|
|
1278
|
+
const splitter = createSplitter(ext, chunkSize, chunkOverlap);
|
|
844
1279
|
const chunks = await splitter.splitText(extracted.text);
|
|
845
|
-
//
|
|
1280
|
+
// 4. Embed all chunks
|
|
846
1281
|
const vectors = await this.embeddingProvider.embed(chunks);
|
|
847
|
-
//
|
|
1282
|
+
// 5. Upsert all chunk points
|
|
848
1283
|
const points = chunks.map((chunk, i) => ({
|
|
849
1284
|
id: pointId(filePath, i),
|
|
850
1285
|
vector: vectors[i],
|
|
@@ -858,12 +1293,9 @@ class DocumentProcessor {
|
|
|
858
1293
|
},
|
|
859
1294
|
}));
|
|
860
1295
|
await this.vectorStore.upsert(points);
|
|
861
|
-
//
|
|
1296
|
+
// 6. Clean up orphaned chunks
|
|
862
1297
|
if (oldTotalChunks > chunks.length) {
|
|
863
|
-
const orphanIds =
|
|
864
|
-
for (let i = chunks.length; i < oldTotalChunks; i++) {
|
|
865
|
-
orphanIds.push(pointId(filePath, i));
|
|
866
|
-
}
|
|
1298
|
+
const orphanIds = chunkIds(filePath, oldTotalChunks).slice(chunks.length);
|
|
867
1299
|
await this.vectorStore.delete(orphanIds);
|
|
868
1300
|
}
|
|
869
1301
|
this.logger.info({ filePath, chunks: chunks.length }, 'File processed successfully');
|
|
@@ -882,15 +1314,10 @@ class DocumentProcessor {
|
|
|
882
1314
|
// Get the existing payload to find total chunks
|
|
883
1315
|
const baseId = pointId(filePath, 0);
|
|
884
1316
|
const existingPayload = await this.vectorStore.getPayload(baseId);
|
|
885
|
-
const totalChunks =
|
|
886
|
-
|
|
887
|
-
: 1;
|
|
888
|
-
const ids = [];
|
|
889
|
-
for (let i = 0; i < totalChunks; i++) {
|
|
890
|
-
ids.push(pointId(filePath, i));
|
|
891
|
-
}
|
|
1317
|
+
const totalChunks = getChunkCount(existingPayload);
|
|
1318
|
+
const ids = chunkIds(filePath, totalChunks);
|
|
892
1319
|
await this.vectorStore.delete(ids);
|
|
893
|
-
await deleteMetadata(filePath, this.metadataDir);
|
|
1320
|
+
await deleteMetadata(filePath, this.config.metadataDir);
|
|
894
1321
|
this.logger.info({ filePath }, 'File deleted from index');
|
|
895
1322
|
}
|
|
896
1323
|
catch (error) {
|
|
@@ -907,21 +1334,16 @@ class DocumentProcessor {
|
|
|
907
1334
|
async processMetadataUpdate(filePath, metadata) {
|
|
908
1335
|
try {
|
|
909
1336
|
// Read existing enrichment metadata and merge
|
|
910
|
-
const existing = (await readMetadata(filePath, this.metadataDir)) ?? {};
|
|
1337
|
+
const existing = (await readMetadata(filePath, this.config.metadataDir)) ?? {};
|
|
911
1338
|
const merged = { ...existing, ...metadata };
|
|
912
|
-
await writeMetadata(filePath, this.metadataDir, merged);
|
|
1339
|
+
await writeMetadata(filePath, this.config.metadataDir, merged);
|
|
913
1340
|
// Update all chunk payloads in Qdrant
|
|
914
1341
|
const baseId = pointId(filePath, 0);
|
|
915
1342
|
const existingPayload = await this.vectorStore.getPayload(baseId);
|
|
916
1343
|
if (!existingPayload)
|
|
917
1344
|
return null;
|
|
918
|
-
const totalChunks =
|
|
919
|
-
|
|
920
|
-
: 1;
|
|
921
|
-
const ids = [];
|
|
922
|
-
for (let i = 0; i < totalChunks; i++) {
|
|
923
|
-
ids.push(pointId(filePath, i));
|
|
924
|
-
}
|
|
1345
|
+
const totalChunks = getChunkCount(existingPayload);
|
|
1346
|
+
const ids = chunkIds(filePath, totalChunks);
|
|
925
1347
|
await this.vectorStore.setPayload(ids, merged);
|
|
926
1348
|
this.logger.info({ filePath, chunks: totalChunks }, 'Metadata updated');
|
|
927
1349
|
return merged;
|
|
@@ -947,27 +1369,11 @@ class DocumentProcessor {
|
|
|
947
1369
|
this.logger.debug({ filePath }, 'File not indexed, skipping');
|
|
948
1370
|
return null;
|
|
949
1371
|
}
|
|
950
|
-
|
|
951
|
-
const
|
|
952
|
-
// Extract frontmatter/json for attribute building (lightweight)
|
|
953
|
-
const extracted = await extractText(filePath, ext);
|
|
954
|
-
// Build attributes + apply current rules
|
|
955
|
-
const attributes = buildAttributes(filePath, stats, extracted.frontmatter, extracted.json);
|
|
956
|
-
const inferred = applyRules(this.compiledRules, attributes);
|
|
957
|
-
// Read enrichment metadata (merge, enrichment wins)
|
|
958
|
-
const enrichment = await readMetadata(filePath, this.metadataDir);
|
|
959
|
-
const metadata = {
|
|
960
|
-
...inferred,
|
|
961
|
-
...(enrichment ?? {}),
|
|
962
|
-
};
|
|
1372
|
+
// Build merged metadata (lightweight — no embedding)
|
|
1373
|
+
const { metadata } = await buildMergedMetadata(filePath, this.compiledRules, this.config.metadataDir, this.config.maps, this.logger);
|
|
963
1374
|
// Update all chunk payloads
|
|
964
|
-
const totalChunks =
|
|
965
|
-
|
|
966
|
-
: 1;
|
|
967
|
-
const ids = [];
|
|
968
|
-
for (let i = 0; i < totalChunks; i++) {
|
|
969
|
-
ids.push(pointId(filePath, i));
|
|
970
|
-
}
|
|
1375
|
+
const totalChunks = getChunkCount(existingPayload);
|
|
1376
|
+
const ids = chunkIds(filePath, totalChunks);
|
|
971
1377
|
await this.vectorStore.setPayload(ids, metadata);
|
|
972
1378
|
this.logger.info({ filePath, chunks: totalChunks }, 'Rules re-applied');
|
|
973
1379
|
return metadata;
|
|
@@ -986,23 +1392,12 @@ class DocumentProcessor {
|
|
|
986
1392
|
this.compiledRules = compiledRules;
|
|
987
1393
|
this.logger.info({ rules: compiledRules.length }, 'Inference rules updated');
|
|
988
1394
|
}
|
|
989
|
-
/**
|
|
990
|
-
* Create the appropriate text splitter for the given file extension.
|
|
991
|
-
*
|
|
992
|
-
* @param ext - File extension.
|
|
993
|
-
* @param chunkSize - Maximum chunk size in characters.
|
|
994
|
-
* @param chunkOverlap - Overlap between chunks in characters.
|
|
995
|
-
* @returns A text splitter instance.
|
|
996
|
-
*/
|
|
997
|
-
createSplitter(ext, chunkSize, chunkOverlap) {
|
|
998
|
-
const lowerExt = ext.toLowerCase();
|
|
999
|
-
if (lowerExt === '.md' || lowerExt === '.markdown') {
|
|
1000
|
-
return new MarkdownTextSplitter({ chunkSize, chunkOverlap });
|
|
1001
|
-
}
|
|
1002
|
-
return new RecursiveCharacterTextSplitter({ chunkSize, chunkOverlap });
|
|
1003
|
-
}
|
|
1004
1395
|
}
|
|
1005
1396
|
|
|
1397
|
+
/**
|
|
1398
|
+
* @module queue
|
|
1399
|
+
* Debounced, rate-limited, concurrent event queue for file watchers. Manages priority queuing and async callbacks. No direct I/O; orchestrates processing.
|
|
1400
|
+
*/
|
|
1006
1401
|
/**
|
|
1007
1402
|
* A debounced, rate-limited, concurrent event queue.
|
|
1008
1403
|
*/
|
|
@@ -1151,19 +1546,23 @@ class VectorStoreClient {
|
|
|
1151
1546
|
client;
|
|
1152
1547
|
collectionName;
|
|
1153
1548
|
dims;
|
|
1549
|
+
logger;
|
|
1154
1550
|
/**
|
|
1155
1551
|
* Create a new VectorStoreClient.
|
|
1156
1552
|
*
|
|
1157
1553
|
* @param config - Vector store configuration.
|
|
1158
1554
|
* @param dimensions - The embedding vector dimensions.
|
|
1555
|
+
* @param logger - Optional pino logger for retry warnings.
|
|
1159
1556
|
*/
|
|
1160
|
-
constructor(config, dimensions) {
|
|
1557
|
+
constructor(config, dimensions, logger) {
|
|
1161
1558
|
this.client = new QdrantClient({
|
|
1162
1559
|
url: config.url,
|
|
1163
1560
|
apiKey: config.apiKey,
|
|
1561
|
+
checkCompatibility: false,
|
|
1164
1562
|
});
|
|
1165
1563
|
this.collectionName = config.collectionName;
|
|
1166
1564
|
this.dims = dimensions;
|
|
1565
|
+
this.logger = logger;
|
|
1167
1566
|
}
|
|
1168
1567
|
/**
|
|
1169
1568
|
* Ensure the collection exists with correct dimensions and Cosine distance.
|
|
@@ -1190,13 +1589,42 @@ class VectorStoreClient {
|
|
|
1190
1589
|
async upsert(points) {
|
|
1191
1590
|
if (points.length === 0)
|
|
1192
1591
|
return;
|
|
1193
|
-
await
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1592
|
+
await retry(async (attempt) => {
|
|
1593
|
+
if (attempt > 1) {
|
|
1594
|
+
const msg = {
|
|
1595
|
+
attempt,
|
|
1596
|
+
operation: 'qdrant.upsert',
|
|
1597
|
+
points: points.length,
|
|
1598
|
+
};
|
|
1599
|
+
if (this.logger) {
|
|
1600
|
+
this.logger.warn(msg, 'Retrying Qdrant upsert');
|
|
1601
|
+
}
|
|
1602
|
+
else {
|
|
1603
|
+
console.warn(msg, 'Retrying Qdrant upsert');
|
|
1604
|
+
}
|
|
1605
|
+
}
|
|
1606
|
+
await this.client.upsert(this.collectionName, {
|
|
1607
|
+
wait: true,
|
|
1608
|
+
points: points.map((p) => ({
|
|
1609
|
+
id: p.id,
|
|
1610
|
+
vector: p.vector,
|
|
1611
|
+
payload: p.payload,
|
|
1612
|
+
})),
|
|
1613
|
+
});
|
|
1614
|
+
}, {
|
|
1615
|
+
attempts: 5,
|
|
1616
|
+
baseDelayMs: 500,
|
|
1617
|
+
maxDelayMs: 10_000,
|
|
1618
|
+
jitter: 0.2,
|
|
1619
|
+
onRetry: ({ attempt, delayMs, error }) => {
|
|
1620
|
+
const msg = { attempt, delayMs, operation: 'qdrant.upsert', error };
|
|
1621
|
+
if (this.logger) {
|
|
1622
|
+
this.logger.warn(msg, 'Qdrant upsert failed; will retry');
|
|
1623
|
+
}
|
|
1624
|
+
else {
|
|
1625
|
+
console.warn(msg, 'Qdrant upsert failed; will retry');
|
|
1626
|
+
}
|
|
1627
|
+
},
|
|
1200
1628
|
});
|
|
1201
1629
|
}
|
|
1202
1630
|
/**
|
|
@@ -1207,9 +1635,38 @@ class VectorStoreClient {
|
|
|
1207
1635
|
async delete(ids) {
|
|
1208
1636
|
if (ids.length === 0)
|
|
1209
1637
|
return;
|
|
1210
|
-
await
|
|
1211
|
-
|
|
1212
|
-
|
|
1638
|
+
await retry(async (attempt) => {
|
|
1639
|
+
if (attempt > 1) {
|
|
1640
|
+
const msg = {
|
|
1641
|
+
attempt,
|
|
1642
|
+
operation: 'qdrant.delete',
|
|
1643
|
+
ids: ids.length,
|
|
1644
|
+
};
|
|
1645
|
+
if (this.logger) {
|
|
1646
|
+
this.logger.warn(msg, 'Retrying Qdrant delete');
|
|
1647
|
+
}
|
|
1648
|
+
else {
|
|
1649
|
+
console.warn(msg, 'Retrying Qdrant delete');
|
|
1650
|
+
}
|
|
1651
|
+
}
|
|
1652
|
+
await this.client.delete(this.collectionName, {
|
|
1653
|
+
wait: true,
|
|
1654
|
+
points: ids,
|
|
1655
|
+
});
|
|
1656
|
+
}, {
|
|
1657
|
+
attempts: 5,
|
|
1658
|
+
baseDelayMs: 500,
|
|
1659
|
+
maxDelayMs: 10_000,
|
|
1660
|
+
jitter: 0.2,
|
|
1661
|
+
onRetry: ({ attempt, delayMs, error }) => {
|
|
1662
|
+
const msg = { attempt, delayMs, operation: 'qdrant.delete', error };
|
|
1663
|
+
if (this.logger) {
|
|
1664
|
+
this.logger.warn(msg, 'Qdrant delete failed; will retry');
|
|
1665
|
+
}
|
|
1666
|
+
else {
|
|
1667
|
+
console.warn(msg, 'Qdrant delete failed; will retry');
|
|
1668
|
+
}
|
|
1669
|
+
},
|
|
1213
1670
|
});
|
|
1214
1671
|
}
|
|
1215
1672
|
/**
|
|
@@ -1309,6 +1766,10 @@ class VectorStoreClient {
|
|
|
1309
1766
|
}
|
|
1310
1767
|
}
|
|
1311
1768
|
|
|
1769
|
+
/**
|
|
1770
|
+
* @module watcher
|
|
1771
|
+
* Filesystem watcher wrapping chokidar. I/O: watches files/directories for add/change/unlink events, enqueues to processing queue.
|
|
1772
|
+
*/
|
|
1312
1773
|
/**
|
|
1313
1774
|
* Filesystem watcher that maps chokidar events to the processing queue.
|
|
1314
1775
|
*/
|
|
@@ -1406,16 +1867,22 @@ class JeevesWatcher {
|
|
|
1406
1867
|
this.logger = logger;
|
|
1407
1868
|
let embeddingProvider;
|
|
1408
1869
|
try {
|
|
1409
|
-
embeddingProvider = createEmbeddingProvider(this.config.embedding);
|
|
1870
|
+
embeddingProvider = createEmbeddingProvider(this.config.embedding, logger);
|
|
1410
1871
|
}
|
|
1411
1872
|
catch (error) {
|
|
1412
1873
|
logger.fatal({ error }, 'Failed to create embedding provider');
|
|
1413
1874
|
throw error;
|
|
1414
1875
|
}
|
|
1415
|
-
const vectorStore = new VectorStoreClient(this.config.vectorStore, embeddingProvider.dimensions);
|
|
1876
|
+
const vectorStore = new VectorStoreClient(this.config.vectorStore, embeddingProvider.dimensions, logger);
|
|
1416
1877
|
await vectorStore.ensureCollection();
|
|
1417
1878
|
const compiledRules = compileRules(this.config.inferenceRules ?? []);
|
|
1418
|
-
const
|
|
1879
|
+
const processorConfig = {
|
|
1880
|
+
metadataDir: this.config.metadataDir ?? '.jeeves-metadata',
|
|
1881
|
+
chunkSize: this.config.embedding.chunkSize,
|
|
1882
|
+
chunkOverlap: this.config.embedding.chunkOverlap,
|
|
1883
|
+
maps: this.config.maps,
|
|
1884
|
+
};
|
|
1885
|
+
const processor = new DocumentProcessor(processorConfig, embeddingProvider, vectorStore, compiledRules, logger);
|
|
1419
1886
|
this.processor = processor;
|
|
1420
1887
|
const queue = new EventQueue({
|
|
1421
1888
|
debounceMs: this.config.watch.debounceMs ?? 2000,
|
|
@@ -1436,7 +1903,7 @@ class JeevesWatcher {
|
|
|
1436
1903
|
this.server = server;
|
|
1437
1904
|
await server.listen({
|
|
1438
1905
|
host: this.config.api?.host ?? '127.0.0.1',
|
|
1439
|
-
port: this.config.api?.port ??
|
|
1906
|
+
port: this.config.api?.port ?? 3456,
|
|
1440
1907
|
});
|
|
1441
1908
|
watcher.start();
|
|
1442
1909
|
this.startConfigWatch();
|
|
@@ -1452,12 +1919,17 @@ class JeevesWatcher {
|
|
|
1452
1919
|
}
|
|
1453
1920
|
if (this.queue) {
|
|
1454
1921
|
const timeout = this.config.shutdownTimeoutMs ?? 10000;
|
|
1455
|
-
await Promise.race([
|
|
1456
|
-
this.queue.drain(),
|
|
1922
|
+
const drained = await Promise.race([
|
|
1923
|
+
this.queue.drain().then(() => true),
|
|
1457
1924
|
new Promise((resolve) => {
|
|
1458
|
-
setTimeout(
|
|
1925
|
+
setTimeout(() => {
|
|
1926
|
+
resolve(false);
|
|
1927
|
+
}, timeout);
|
|
1459
1928
|
}),
|
|
1460
1929
|
]);
|
|
1930
|
+
if (!drained) {
|
|
1931
|
+
this.logger?.warn({ timeoutMs: timeout }, 'Queue drain timeout hit, forcing shutdown');
|
|
1932
|
+
}
|
|
1461
1933
|
}
|
|
1462
1934
|
if (this.server) {
|
|
1463
1935
|
await this.server.close();
|
|
@@ -1506,6 +1978,7 @@ class JeevesWatcher {
|
|
|
1506
1978
|
const processor = this.processor;
|
|
1507
1979
|
if (!logger || !processor || !this.configPath)
|
|
1508
1980
|
return;
|
|
1981
|
+
logger.info({ configPath: this.configPath }, 'Config change detected, reloading...');
|
|
1509
1982
|
try {
|
|
1510
1983
|
const newConfig = await loadConfig(this.configPath);
|
|
1511
1984
|
this.config = newConfig;
|
|
@@ -1537,19 +2010,318 @@ async function startFromConfig(configPath) {
|
|
|
1537
2010
|
return app;
|
|
1538
2011
|
}
|
|
1539
2012
|
|
|
1540
|
-
|
|
1541
|
-
|
|
1542
|
-
|
|
1543
|
-
|
|
2013
|
+
/**
|
|
2014
|
+
* @module api
|
|
2015
|
+
*
|
|
2016
|
+
* Small fetch wrapper for jeeves-watcher CLI commands.
|
|
2017
|
+
*/
|
|
1544
2018
|
/**
 * Build the base URL for the jeeves-watcher HTTP API.
 *
 * @param host - API host.
 * @param port - API port.
 * @returns The `http://host:port` base URL string.
 */
function apiBase(host, port) {
  const base = `http://${host}:${port}`;
  return base;
}
|
|
1547
|
-
|
|
1548
|
-
|
|
1549
|
-
|
|
2021
|
+
/**
|
|
2022
|
+
* Call the jeeves-watcher HTTP API.
|
|
2023
|
+
*
|
|
2024
|
+
* @param host - API host.
|
|
2025
|
+
* @param port - API port.
|
|
2026
|
+
* @param method - HTTP method.
|
|
2027
|
+
* @param path - Request path.
|
|
2028
|
+
* @param body - Optional JSON body.
|
|
2029
|
+
* @returns Response body as text.
|
|
2030
|
+
*/
|
|
2031
|
+
/**
 * Call the jeeves-watcher HTTP API.
 *
 * @param host - API host.
 * @param port - API port.
 * @param method - HTTP method.
 * @param path - Request path.
 * @param body - Optional JSON body.
 * @returns Response body as text.
 * @throws Error when the response status is not ok (message is the body text,
 *   or `HTTP <status>` when the body is empty).
 */
async function apiCall(host, port, method, path, body) {
  const init = { method, headers: {} };
  if (body !== undefined) {
    init.headers['content-type'] = 'application/json';
    init.body = JSON.stringify(body);
  }
  const response = await fetch(`${apiBase(host, port)}${path}`, init);
  const text = await response.text();
  if (response.ok) return text;
  throw new Error(text || `HTTP ${String(response.status)}`);
}
|
|
2049
|
+
|
|
2050
|
+
/**
|
|
2051
|
+
* @module commands/configReindex
|
|
2052
|
+
*
|
|
2053
|
+
* CLI command: config-reindex.
|
|
2054
|
+
*/
|
|
2055
|
+
function registerConfigReindexCommand(cli) {
|
|
2056
|
+
cli
|
|
2057
|
+
.command('config-reindex')
|
|
2058
|
+
.description('Reindex after configuration changes (POST /config-reindex)')
|
|
2059
|
+
.option('-s, --scope <scope>', 'Reindex scope (rules|full)', 'rules')
|
|
2060
|
+
.option('-p, --port <port>', 'API port', '3456')
|
|
2061
|
+
.option('-H, --host <host>', 'API host', '127.0.0.1')
|
|
2062
|
+
.action(async (options) => {
|
|
2063
|
+
const scope = options.scope;
|
|
2064
|
+
if (scope !== 'rules' && scope !== 'full') {
|
|
2065
|
+
console.error('Invalid scope. Must be "rules" or "full"');
|
|
2066
|
+
process.exit(1);
|
|
2067
|
+
}
|
|
2068
|
+
try {
|
|
2069
|
+
const text = await apiCall(options.host, options.port, 'POST', '/config-reindex', { scope });
|
|
2070
|
+
console.log(text);
|
|
2071
|
+
}
|
|
2072
|
+
catch (error) {
|
|
2073
|
+
console.error(error instanceof Error ? error.message : String(error));
|
|
2074
|
+
process.exit(1);
|
|
2075
|
+
}
|
|
2076
|
+
});
|
|
2077
|
+
}
|
|
2078
|
+
|
|
2079
|
+
/**
|
|
2080
|
+
* @module commands/enrich
|
|
2081
|
+
*
|
|
2082
|
+
* CLI command: enrich.
|
|
2083
|
+
*/
|
|
2084
|
+
function registerEnrichCommand(cli) {
|
|
2085
|
+
cli
|
|
2086
|
+
.command('enrich')
|
|
2087
|
+
.description('Enrich document metadata (POST /metadata)')
|
|
2088
|
+
.argument('<path>', 'File path to enrich')
|
|
2089
|
+
.option('-k, --key <key=value...>', 'Metadata key-value pairs (repeatable)', [])
|
|
2090
|
+
.option('-j, --json <json>', 'Metadata as JSON string (e.g., \'{"key":"value"}\')')
|
|
2091
|
+
.option('-p, --port <port>', 'API port', '3456')
|
|
2092
|
+
.option('-H, --host <host>', 'API host', '127.0.0.1')
|
|
2093
|
+
.action(async (path, options) => {
|
|
2094
|
+
try {
|
|
2095
|
+
let metadata = {};
|
|
2096
|
+
// Parse --json option
|
|
2097
|
+
if (options.json) {
|
|
2098
|
+
try {
|
|
2099
|
+
metadata = JSON.parse(options.json);
|
|
2100
|
+
}
|
|
2101
|
+
catch {
|
|
2102
|
+
console.error('Invalid JSON:', options.json);
|
|
2103
|
+
process.exit(1);
|
|
2104
|
+
}
|
|
2105
|
+
}
|
|
2106
|
+
// Parse --key options (key=value pairs)
|
|
2107
|
+
if (Array.isArray(options.key) && options.key.length > 0) {
|
|
2108
|
+
for (const pair of options.key) {
|
|
2109
|
+
const eqIndex = pair.indexOf('=');
|
|
2110
|
+
if (eqIndex === -1) {
|
|
2111
|
+
console.error(`Invalid key-value pair: ${pair}`);
|
|
2112
|
+
console.error('Expected format: key=value');
|
|
2113
|
+
process.exit(1);
|
|
2114
|
+
}
|
|
2115
|
+
const key = pair.slice(0, eqIndex);
|
|
2116
|
+
const value = pair.slice(eqIndex + 1);
|
|
2117
|
+
metadata[key] = value;
|
|
2118
|
+
}
|
|
2119
|
+
}
|
|
2120
|
+
if (Object.keys(metadata).length === 0) {
|
|
2121
|
+
console.error('No metadata provided. Use --key or --json.');
|
|
2122
|
+
process.exit(1);
|
|
2123
|
+
}
|
|
2124
|
+
const text = await apiCall(options.host, options.port, 'POST', '/metadata', {
|
|
2125
|
+
path,
|
|
2126
|
+
metadata,
|
|
2127
|
+
});
|
|
2128
|
+
try {
|
|
2129
|
+
const parsed = JSON.parse(text);
|
|
2130
|
+
console.log(JSON.stringify(parsed, null, 2));
|
|
2131
|
+
}
|
|
2132
|
+
catch {
|
|
2133
|
+
console.log(text);
|
|
2134
|
+
}
|
|
2135
|
+
}
|
|
2136
|
+
catch (error) {
|
|
2137
|
+
console.error(error instanceof Error ? error.message : String(error));
|
|
2138
|
+
process.exit(1);
|
|
2139
|
+
}
|
|
2140
|
+
});
|
|
2141
|
+
}
|
|
2142
|
+
|
|
2143
|
+
/**
|
|
2144
|
+
* @module commands/rebuildMetadata
|
|
2145
|
+
*
|
|
2146
|
+
* CLI command: rebuild-metadata.
|
|
2147
|
+
*/
|
|
2148
|
+
function registerRebuildMetadataCommand(cli) {
|
|
2149
|
+
cli
|
|
2150
|
+
.command('rebuild-metadata')
|
|
2151
|
+
.description('Rebuild metadata store from Qdrant (POST /rebuild-metadata)')
|
|
2152
|
+
.option('-p, --port <port>', 'API port', '3456')
|
|
2153
|
+
.option('-H, --host <host>', 'API host', '127.0.0.1')
|
|
2154
|
+
.action(async (options) => {
|
|
2155
|
+
try {
|
|
2156
|
+
const text = await apiCall(options.host, options.port, 'POST', '/rebuild-metadata');
|
|
2157
|
+
console.log(text);
|
|
2158
|
+
}
|
|
2159
|
+
catch (error) {
|
|
2160
|
+
console.error(error instanceof Error ? error.message : String(error));
|
|
2161
|
+
process.exit(1);
|
|
2162
|
+
}
|
|
2163
|
+
});
|
|
2164
|
+
}
|
|
2165
|
+
|
|
2166
|
+
/**
|
|
2167
|
+
* @module commands/reindex
|
|
2168
|
+
*
|
|
2169
|
+
* CLI command: reindex.
|
|
2170
|
+
*/
|
|
2171
|
+
function registerReindexCommand(cli) {
|
|
2172
|
+
cli
|
|
2173
|
+
.command('reindex')
|
|
2174
|
+
.description('Reindex all watched files (POST /reindex)')
|
|
2175
|
+
.option('-p, --port <port>', 'API port', '3456')
|
|
2176
|
+
.option('-H, --host <host>', 'API host', '127.0.0.1')
|
|
2177
|
+
.action(async (options) => {
|
|
2178
|
+
try {
|
|
2179
|
+
const text = await apiCall(options.host, options.port, 'POST', '/reindex');
|
|
2180
|
+
console.log(text);
|
|
2181
|
+
}
|
|
2182
|
+
catch (error) {
|
|
2183
|
+
console.error(error instanceof Error ? error.message : String(error));
|
|
2184
|
+
process.exit(1);
|
|
2185
|
+
}
|
|
2186
|
+
});
|
|
2187
|
+
}
|
|
2188
|
+
|
|
2189
|
+
/**
|
|
2190
|
+
* @module commands/search
|
|
2191
|
+
*
|
|
2192
|
+
* CLI command: search.
|
|
2193
|
+
*/
|
|
2194
|
+
function registerSearchCommand(cli) {
|
|
2195
|
+
cli
|
|
2196
|
+
.command('search')
|
|
2197
|
+
.description('Search the vector store (POST /search)')
|
|
2198
|
+
.argument('<query>', 'Search query')
|
|
2199
|
+
.option('-l, --limit <limit>', 'Max results', '10')
|
|
2200
|
+
.option('-p, --port <port>', 'API port', '3456')
|
|
2201
|
+
.option('-H, --host <host>', 'API host', '127.0.0.1')
|
|
2202
|
+
.action(async (query, options) => {
|
|
2203
|
+
try {
|
|
2204
|
+
const text = await apiCall(options.host, options.port, 'POST', '/search', {
|
|
2205
|
+
query,
|
|
2206
|
+
limit: Number(options.limit),
|
|
2207
|
+
});
|
|
2208
|
+
try {
|
|
2209
|
+
const parsed = JSON.parse(text);
|
|
2210
|
+
console.log(JSON.stringify(parsed, null, 2));
|
|
2211
|
+
}
|
|
2212
|
+
catch {
|
|
2213
|
+
console.log(text);
|
|
2214
|
+
}
|
|
2215
|
+
}
|
|
2216
|
+
catch (error) {
|
|
2217
|
+
console.error(error instanceof Error ? error.message : String(error));
|
|
2218
|
+
process.exit(1);
|
|
2219
|
+
}
|
|
2220
|
+
});
|
|
2221
|
+
}
|
|
2222
|
+
|
|
2223
|
+
/**
|
|
2224
|
+
* @module commands/service
|
|
2225
|
+
*
|
|
2226
|
+
* CLI command: service.
|
|
2227
|
+
*/
|
|
2228
|
+
function registerServiceCommand(cli) {
|
|
2229
|
+
cli
|
|
2230
|
+
.command('service')
|
|
2231
|
+
.description('Generate service install/uninstall instructions')
|
|
2232
|
+
.addCommand(new Command('install')
|
|
2233
|
+
.description('Print install instructions for a system service')
|
|
2234
|
+
.option('-c, --config <path>', 'Path to configuration file')
|
|
2235
|
+
.option('-n, --name <name>', 'Service name', 'jeeves-watcher')
|
|
2236
|
+
.action((options) => {
|
|
2237
|
+
const name = options.name;
|
|
2238
|
+
const configPath = options.config;
|
|
2239
|
+
if (process.platform === 'win32') {
|
|
2240
|
+
console.log('NSSM install (example):');
|
|
2241
|
+
console.log(` nssm install ${name} node "%CD%\\node_modules\\@karmaniverous\\jeeves-watcher\\dist\\cli\\jeeves-watcher\\index.js" start${configPath ? ` --config "${configPath}"` : ''}`);
|
|
2242
|
+
console.log(` nssm set ${name} AppDirectory "%CD%"`);
|
|
2243
|
+
console.log(` nssm set ${name} Start SERVICE_AUTO_START`);
|
|
2244
|
+
console.log(` nssm start ${name}`);
|
|
2245
|
+
return;
|
|
2246
|
+
}
|
|
2247
|
+
const unit = `[Unit]\nDescription=Jeeves Watcher\nAfter=network.target\n\n[Service]\nType=simple\nWorkingDirectory=%h\nExecStart=/usr/bin/env jeeves-watcher start${configPath ? ` --config ${configPath}` : ''}\nRestart=on-failure\n\n[Install]\nWantedBy=default.target\n`;
|
|
2248
|
+
console.log('# systemd unit file');
|
|
2249
|
+
console.log(`# ~/.config/systemd/user/${name}.service`);
|
|
2250
|
+
console.log(unit);
|
|
2251
|
+
console.log('# install');
|
|
2252
|
+
console.log(` systemctl --user daemon-reload`);
|
|
2253
|
+
console.log(` systemctl --user enable --now ${name}.service`);
|
|
2254
|
+
}))
|
|
2255
|
+
.addCommand(new Command('uninstall')
|
|
2256
|
+
.description('Print uninstall instructions for a system service')
|
|
2257
|
+
.option('-n, --name <name>', 'Service name', 'jeeves-watcher')
|
|
2258
|
+
.action((options) => {
|
|
2259
|
+
const name = options.name;
|
|
2260
|
+
if (process.platform === 'win32') {
|
|
2261
|
+
console.log('NSSM uninstall (example):');
|
|
2262
|
+
console.log(` nssm stop ${name}`);
|
|
2263
|
+
console.log(` nssm remove ${name} confirm`);
|
|
2264
|
+
return;
|
|
2265
|
+
}
|
|
2266
|
+
console.log('# systemd uninstall');
|
|
2267
|
+
console.log(` systemctl --user disable --now ${name}.service`);
|
|
2268
|
+
console.log(`# remove ~/.config/systemd/user/${name}.service`);
|
|
2269
|
+
console.log(` systemctl --user daemon-reload`);
|
|
2270
|
+
}));
|
|
2271
|
+
}
|
|
2272
|
+
|
|
2273
|
+
/**
|
|
2274
|
+
* @module commands/status
|
|
2275
|
+
*
|
|
2276
|
+
* CLI command: status.
|
|
2277
|
+
*/
|
|
2278
|
+
function registerStatusCommand(cli) {
|
|
2279
|
+
cli
|
|
2280
|
+
.command('status')
|
|
2281
|
+
.description('Show watcher status')
|
|
2282
|
+
.option('-p, --port <port>', 'API port', '3456')
|
|
2283
|
+
.option('-H, --host <host>', 'API host', '127.0.0.1')
|
|
2284
|
+
.action(async (options) => {
|
|
2285
|
+
try {
|
|
2286
|
+
const text = await apiCall(options.host, options.port, 'GET', '/status');
|
|
2287
|
+
try {
|
|
2288
|
+
const parsed = JSON.parse(text);
|
|
2289
|
+
console.log(JSON.stringify(parsed, null, 2));
|
|
2290
|
+
}
|
|
2291
|
+
catch {
|
|
2292
|
+
console.log(text);
|
|
2293
|
+
}
|
|
2294
|
+
}
|
|
2295
|
+
catch {
|
|
2296
|
+
console.error('Could not connect to jeeves-watcher. Is it running?');
|
|
2297
|
+
process.exit(1);
|
|
2298
|
+
}
|
|
2299
|
+
});
|
|
2300
|
+
}
|
|
2301
|
+
|
|
2302
|
+
/**
|
|
2303
|
+
* @module cli/jeeves-watcher/writeJsonFile
|
|
2304
|
+
* Writes pretty-printed JSON files for CLI commands. I/O: writes JSON with stable formatting (2-space indent, trailing newline) to disk.
|
|
2305
|
+
*/
|
|
2306
|
+
/**
|
|
2307
|
+
* Write JSON to a file with stable formatting.
|
|
2308
|
+
*
|
|
2309
|
+
* @param path - Destination path.
|
|
2310
|
+
* @param data - JSON-serializable data.
|
|
2311
|
+
*/
|
|
1550
2312
|
async function writeJsonFile(path, data) {
|
|
1551
2313
|
await writeFile(path, `${JSON.stringify(data, null, 2)}\n`, 'utf8');
|
|
1552
2314
|
}
|
|
2315
|
+
|
|
2316
|
+
/**
|
|
2317
|
+
* @module cli/jeeves-watcher
|
|
2318
|
+
*
|
|
2319
|
+
* jeeves-watcher CLI entrypoint.
|
|
2320
|
+
*/
|
|
2321
|
+
const cli = new Command()
|
|
2322
|
+
.name('jeeves-watcher')
|
|
2323
|
+
.description('Filesystem watcher that keeps a Qdrant vector store in sync with document changes')
|
|
2324
|
+
.version('0.7.0');
|
|
1553
2325
|
cli
|
|
1554
2326
|
.command('start')
|
|
1555
2327
|
.description('Start the filesystem watcher')
|
|
@@ -1574,59 +2346,20 @@ cli
|
|
|
1574
2346
|
console.log(` Watch paths: ${config.watch.paths.join(', ')}`);
|
|
1575
2347
|
console.log(` Embedding: ${config.embedding.provider}/${config.embedding.model}`);
|
|
1576
2348
|
console.log(` Vector store: ${config.vectorStore.url} (${config.vectorStore.collectionName})`);
|
|
1577
|
-
console.log(` API: ${config.api?.host ?? '127.0.0.1'}:${String(config.api?.port ??
|
|
2349
|
+
console.log(` API: ${config.api?.host ?? '127.0.0.1'}:${String(config.api?.port ?? 3456)}`);
|
|
1578
2350
|
}
|
|
1579
2351
|
catch (error) {
|
|
1580
2352
|
console.error('Config invalid:', error);
|
|
1581
2353
|
process.exit(1);
|
|
1582
2354
|
}
|
|
1583
2355
|
});
|
|
1584
|
-
cli
|
|
1585
|
-
.command('status')
|
|
1586
|
-
.description('Show watcher status')
|
|
1587
|
-
.option('-p, --port <port>', 'API port', '3458')
|
|
1588
|
-
.option('-H, --host <host>', 'API host', '127.0.0.1')
|
|
1589
|
-
.action(async (options) => {
|
|
1590
|
-
try {
|
|
1591
|
-
const url = `http://${options.host}:${options.port}/status`;
|
|
1592
|
-
const response = await fetch(url);
|
|
1593
|
-
const data = (await response.json());
|
|
1594
|
-
console.log(JSON.stringify(data, null, 2));
|
|
1595
|
-
}
|
|
1596
|
-
catch {
|
|
1597
|
-
console.error('Could not connect to jeeves-watcher. Is it running?');
|
|
1598
|
-
process.exit(1);
|
|
1599
|
-
}
|
|
1600
|
-
});
|
|
1601
2356
|
cli
|
|
1602
2357
|
.command('init')
|
|
1603
2358
|
.description('Initialize a new configuration (jeeves-watcher.config.json)')
|
|
1604
2359
|
.option('-o, --output <path>', 'Output config file path', 'jeeves-watcher.config.json')
|
|
1605
2360
|
.action(async (options) => {
|
|
1606
2361
|
try {
|
|
1607
|
-
|
|
1608
|
-
watch: {
|
|
1609
|
-
paths: ['**/*.{md,markdown,txt,text,json,html,htm,pdf,docx}'],
|
|
1610
|
-
ignored: [
|
|
1611
|
-
'**/node_modules/**',
|
|
1612
|
-
'**/.git/**',
|
|
1613
|
-
'**/.jeeves-watcher/**',
|
|
1614
|
-
],
|
|
1615
|
-
},
|
|
1616
|
-
configWatch: { enabled: true, debounceMs: 1000 },
|
|
1617
|
-
embedding: {
|
|
1618
|
-
provider: 'gemini',
|
|
1619
|
-
model: 'text-embedding-004',
|
|
1620
|
-
},
|
|
1621
|
-
vectorStore: {
|
|
1622
|
-
url: 'http://127.0.0.1:6333',
|
|
1623
|
-
collectionName: 'jeeves-watcher',
|
|
1624
|
-
},
|
|
1625
|
-
metadataDir: '.jeeves-watcher',
|
|
1626
|
-
api: { host: '127.0.0.1', port: 3100 },
|
|
1627
|
-
logging: { level: 'info' },
|
|
1628
|
-
};
|
|
1629
|
-
await writeJsonFile(options.output, config);
|
|
2362
|
+
await writeJsonFile(options.output, INIT_CONFIG_TEMPLATE);
|
|
1630
2363
|
console.log(`Wrote ${options.output}`);
|
|
1631
2364
|
}
|
|
1632
2365
|
catch (error) {
|
|
@@ -1634,132 +2367,12 @@ cli
|
|
|
1634
2367
|
process.exit(1);
|
|
1635
2368
|
}
|
|
1636
2369
|
});
|
|
1637
|
-
|
|
1638
|
-
|
|
1639
|
-
|
|
1640
|
-
|
|
1641
|
-
|
|
1642
|
-
|
|
1643
|
-
|
|
1644
|
-
|
|
1645
|
-
const text = await res.text();
|
|
1646
|
-
if (!res.ok) {
|
|
1647
|
-
console.error(text);
|
|
1648
|
-
process.exit(1);
|
|
1649
|
-
}
|
|
1650
|
-
console.log(text);
|
|
1651
|
-
});
|
|
1652
|
-
cli
|
|
1653
|
-
.command('rebuild-metadata')
|
|
1654
|
-
.description('Rebuild metadata store from Qdrant (POST /rebuild-metadata)')
|
|
1655
|
-
.option('-p, --port <port>', 'API port', '3458')
|
|
1656
|
-
.option('-H, --host <host>', 'API host', '127.0.0.1')
|
|
1657
|
-
.action(async (options) => {
|
|
1658
|
-
const url = `${apiBase(options.host, options.port)}/rebuild-metadata`;
|
|
1659
|
-
const res = await fetch(url, { method: 'POST' });
|
|
1660
|
-
const text = await res.text();
|
|
1661
|
-
if (!res.ok) {
|
|
1662
|
-
console.error(text);
|
|
1663
|
-
process.exit(1);
|
|
1664
|
-
}
|
|
1665
|
-
console.log(text);
|
|
1666
|
-
});
|
|
1667
|
-
cli
|
|
1668
|
-
.command('search')
|
|
1669
|
-
.description('Search the vector store (POST /search)')
|
|
1670
|
-
.argument('<query>', 'Search query')
|
|
1671
|
-
.option('-l, --limit <limit>', 'Max results', '10')
|
|
1672
|
-
.option('-p, --port <port>', 'API port', '3458')
|
|
1673
|
-
.option('-H, --host <host>', 'API host', '127.0.0.1')
|
|
1674
|
-
.action(async (query, options) => {
|
|
1675
|
-
const url = `${apiBase(options.host, options.port)}/search`;
|
|
1676
|
-
const res = await fetch(url, {
|
|
1677
|
-
method: 'POST',
|
|
1678
|
-
headers: { 'content-type': 'application/json' },
|
|
1679
|
-
body: JSON.stringify({ query, limit: Number(options.limit) }),
|
|
1680
|
-
});
|
|
1681
|
-
const text = await res.text();
|
|
1682
|
-
if (!res.ok) {
|
|
1683
|
-
console.error(text);
|
|
1684
|
-
process.exit(1);
|
|
1685
|
-
}
|
|
1686
|
-
try {
|
|
1687
|
-
const parsed = JSON.parse(text);
|
|
1688
|
-
console.log(JSON.stringify(parsed, null, 2));
|
|
1689
|
-
}
|
|
1690
|
-
catch {
|
|
1691
|
-
console.log(text);
|
|
1692
|
-
}
|
|
1693
|
-
});
|
|
1694
|
-
cli
|
|
1695
|
-
.command('enrich')
|
|
1696
|
-
.description('Enrich document metadata')
|
|
1697
|
-
.action(stubAction('enrich'));
|
|
1698
|
-
cli
|
|
1699
|
-
.command('service')
|
|
1700
|
-
.description('Generate service install/uninstall instructions')
|
|
1701
|
-
.addCommand(new Command('install')
|
|
1702
|
-
.description('Print install instructions for a system service')
|
|
1703
|
-
.option('-c, --config <path>', 'Path to configuration file')
|
|
1704
|
-
.option('-n, --name <name>', 'Service name', 'jeeves-watcher')
|
|
1705
|
-
.action((options) => {
|
|
1706
|
-
const name = options.name;
|
|
1707
|
-
const configPath = options.config;
|
|
1708
|
-
if (process.platform === 'win32') {
|
|
1709
|
-
console.log('NSSM install (example):');
|
|
1710
|
-
console.log(` nssm install ${name} node "%CD%\\node_modules\\@karmaniverous\\jeeves-watcher\\dist\\cli\\jeeves-watcher\\index.js" start${configPath ? ` --config "${configPath}"` : ''}`);
|
|
1711
|
-
console.log(` nssm set ${name} AppDirectory "%CD%"`);
|
|
1712
|
-
console.log(` nssm set ${name} Start SERVICE_AUTO_START`);
|
|
1713
|
-
console.log(` nssm start ${name}`);
|
|
1714
|
-
return;
|
|
1715
|
-
}
|
|
1716
|
-
const unit = `[Unit]\nDescription=Jeeves Watcher\nAfter=network.target\n\n[Service]\nType=simple\nWorkingDirectory=%h\nExecStart=/usr/bin/env jeeves-watcher start${configPath ? ` --config ${configPath}` : ''}\nRestart=on-failure\n\n[Install]\nWantedBy=default.target\n`;
|
|
1717
|
-
console.log('# systemd unit file');
|
|
1718
|
-
console.log(`# ~/.config/systemd/user/${name}.service`);
|
|
1719
|
-
console.log(unit);
|
|
1720
|
-
console.log('# install');
|
|
1721
|
-
console.log(` systemctl --user daemon-reload`);
|
|
1722
|
-
console.log(` systemctl --user enable --now ${name}.service`);
|
|
1723
|
-
}))
|
|
1724
|
-
.addCommand(new Command('uninstall')
|
|
1725
|
-
.description('Print uninstall instructions for a system service')
|
|
1726
|
-
.option('-n, --name <name>', 'Service name', 'jeeves-watcher')
|
|
1727
|
-
.action((options) => {
|
|
1728
|
-
const name = options.name;
|
|
1729
|
-
if (process.platform === 'win32') {
|
|
1730
|
-
console.log('NSSM uninstall (example):');
|
|
1731
|
-
console.log(` nssm stop ${name}`);
|
|
1732
|
-
console.log(` nssm remove ${name} confirm`);
|
|
1733
|
-
return;
|
|
1734
|
-
}
|
|
1735
|
-
console.log('# systemd uninstall');
|
|
1736
|
-
console.log(` systemctl --user disable --now ${name}.service`);
|
|
1737
|
-
console.log(`# remove ~/.config/systemd/user/${name}.service`);
|
|
1738
|
-
console.log(` systemctl --user daemon-reload`);
|
|
1739
|
-
}));
|
|
1740
|
-
cli
|
|
1741
|
-
.command('config-reindex')
|
|
1742
|
-
.description('Reindex after configuration changes (POST /config-reindex)')
|
|
1743
|
-
.option('-s, --scope <scope>', 'Reindex scope (rules|full)', 'rules')
|
|
1744
|
-
.option('-p, --port <port>', 'API port', '3458')
|
|
1745
|
-
.option('-H, --host <host>', 'API host', '127.0.0.1')
|
|
1746
|
-
.action(async (options) => {
|
|
1747
|
-
const scope = options.scope;
|
|
1748
|
-
if (scope !== 'rules' && scope !== 'full') {
|
|
1749
|
-
console.error('Invalid scope. Must be "rules" or "full"');
|
|
1750
|
-
process.exit(1);
|
|
1751
|
-
}
|
|
1752
|
-
const url = `${apiBase(options.host, options.port)}/config-reindex`;
|
|
1753
|
-
const res = await fetch(url, {
|
|
1754
|
-
method: 'POST',
|
|
1755
|
-
headers: { 'content-type': 'application/json' },
|
|
1756
|
-
body: JSON.stringify({ scope }),
|
|
1757
|
-
});
|
|
1758
|
-
const text = await res.text();
|
|
1759
|
-
if (!res.ok) {
|
|
1760
|
-
console.error(text);
|
|
1761
|
-
process.exit(1);
|
|
1762
|
-
}
|
|
1763
|
-
console.log(text);
|
|
1764
|
-
});
|
|
2370
|
+
// API-backed commands
|
|
2371
|
+
registerStatusCommand(cli);
|
|
2372
|
+
registerReindexCommand(cli);
|
|
2373
|
+
registerRebuildMetadataCommand(cli);
|
|
2374
|
+
registerSearchCommand(cli);
|
|
2375
|
+
registerEnrichCommand(cli);
|
|
2376
|
+
registerConfigReindexCommand(cli);
|
|
2377
|
+
registerServiceCommand(cli);
|
|
1765
2378
|
cli.parse();
|