@karmaniverous/jeeves-watcher 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -16
- package/config.schema.json +577 -0
- package/dist/cjs/index.js +800 -340
- package/dist/cli/jeeves-watcher/index.js +1130 -517
- package/dist/index.d.ts +160 -103
- package/dist/index.iife.js +796 -339
- package/dist/index.iife.min.js +1 -1
- package/dist/mjs/index.js +793 -341
- package/package.json +28 -22
package/dist/mjs/index.js
CHANGED
|
@@ -1,21 +1,28 @@
|
|
|
1
1
|
import Fastify from 'fastify';
|
|
2
|
+
import { omit, get } from 'radash';
|
|
2
3
|
import { createHash } from 'node:crypto';
|
|
3
4
|
import { readFile, mkdir, writeFile, rm, readdir, stat } from 'node:fs/promises';
|
|
4
5
|
import { join, dirname, resolve, extname, basename } from 'node:path';
|
|
5
6
|
import picomatch from 'picomatch';
|
|
6
7
|
import chokidar from 'chokidar';
|
|
7
|
-
import Ajv from 'ajv';
|
|
8
8
|
import { cosmiconfig } from 'cosmiconfig';
|
|
9
|
+
import { z, ZodError } from 'zod';
|
|
10
|
+
import { jsonMapMapSchema, JsonMap } from '@karmaniverous/jsonmap';
|
|
9
11
|
import { GoogleGenerativeAIEmbeddings } from '@langchain/google-genai';
|
|
10
12
|
import pino from 'pino';
|
|
11
|
-
import {
|
|
13
|
+
import { v5 } from 'uuid';
|
|
12
14
|
import * as cheerio from 'cheerio';
|
|
13
15
|
import yaml from 'js-yaml';
|
|
14
16
|
import mammoth from 'mammoth';
|
|
15
|
-
import
|
|
17
|
+
import Ajv from 'ajv';
|
|
16
18
|
import addFormats from 'ajv-formats';
|
|
19
|
+
import { MarkdownTextSplitter, RecursiveCharacterTextSplitter } from '@langchain/textsplitters';
|
|
17
20
|
import { QdrantClient } from '@qdrant/js-client-rest';
|
|
18
21
|
|
|
22
|
+
/**
|
|
23
|
+
* @module metadata/metadata
|
|
24
|
+
* Persists file metadata as .meta.json. I/O: reads/writes/deletes metadata files under metadataDir. Path mapping via SHA-256 hash.
|
|
25
|
+
*/
|
|
19
26
|
/**
|
|
20
27
|
* Normalise a file path for deterministic mapping: lowercase, forward slashes, strip leading drive letter colon.
|
|
21
28
|
*
|
|
@@ -155,6 +162,30 @@ async function listFilesFromGlobs(patterns, ignored = []) {
|
|
|
155
162
|
return Array.from(seen);
|
|
156
163
|
}
|
|
157
164
|
|
|
165
|
+
/**
|
|
166
|
+
* @module processAllFiles
|
|
167
|
+
*
|
|
168
|
+
* Shared helper for processing all files matching configured globs.
|
|
169
|
+
*/
|
|
170
|
+
/**
 * Run a single processor method over every file matched by the watch globs.
 *
 * Files are handled strictly one after another; the first rejection from the
 * processor aborts the run and propagates to the caller.
 *
 * @param watchPaths - The glob patterns to match.
 * @param ignoredPaths - The glob patterns to ignore.
 * @param processor - The document processor instance.
 * @param method - The processor method to call ('processFile' or 'processRulesUpdate').
 * @returns The number of files processed.
 * @throws TypeError if `processor[method]` is not a function.
 */
async function processAllFiles(watchPaths, ignoredPaths, processor, method) {
    // Fail fast, before scanning the file system, when the requested method
    // does not exist; otherwise the mistake only surfaces mid-run as an opaque
    // "processor[method] is not a function".
    if (typeof processor?.[method] !== 'function') {
        throw new TypeError(`Processor has no method named "${String(method)}"`);
    }
    const files = await listFilesFromGlobs(watchPaths, ignoredPaths);
    for (const file of files) {
        // Sequential on purpose to avoid surprising load.
        // Queue integration can come later.
        await processor[method](file);
    }
    return files.length;
}
|
|
188
|
+
|
|
158
189
|
/**
|
|
159
190
|
* Create the Fastify API server with all routes registered.
|
|
160
191
|
*
|
|
@@ -195,15 +226,8 @@ function createApiServer(options) {
|
|
|
195
226
|
});
|
|
196
227
|
app.post('/reindex', async (_request, reply) => {
|
|
197
228
|
try {
|
|
198
|
-
const
|
|
199
|
-
|
|
200
|
-
// Sequential on purpose to avoid surprising load.
|
|
201
|
-
// Queue integration can come later.
|
|
202
|
-
await processor.processFile(file);
|
|
203
|
-
}
|
|
204
|
-
return await reply
|
|
205
|
-
.status(200)
|
|
206
|
-
.send({ ok: true, filesIndexed: files.length });
|
|
229
|
+
const count = await processAllFiles(options.config.watch.paths, options.config.watch.ignored, processor, 'processFile');
|
|
230
|
+
return await reply.status(200).send({ ok: true, filesIndexed: count });
|
|
207
231
|
}
|
|
208
232
|
catch (error) {
|
|
209
233
|
logger.error({ error }, 'Reindex failed');
|
|
@@ -213,19 +237,21 @@ function createApiServer(options) {
|
|
|
213
237
|
app.post('/rebuild-metadata', async (_request, reply) => {
|
|
214
238
|
try {
|
|
215
239
|
const metadataDir = options.config.metadataDir ?? '.jeeves-metadata';
|
|
240
|
+
const SYSTEM_KEYS = [
|
|
241
|
+
'file_path',
|
|
242
|
+
'chunk_index',
|
|
243
|
+
'total_chunks',
|
|
244
|
+
'content_hash',
|
|
245
|
+
'chunk_text',
|
|
246
|
+
];
|
|
216
247
|
for await (const point of vectorStore.scroll()) {
|
|
217
248
|
const payload = point.payload;
|
|
218
249
|
const filePath = payload['file_path'];
|
|
219
250
|
if (typeof filePath !== 'string' || filePath.length === 0)
|
|
220
251
|
continue;
|
|
221
252
|
// Persist only enrichment-ish fields, not chunking/index fields.
|
|
222
|
-
const
|
|
223
|
-
|
|
224
|
-
delete rest.chunk_index;
|
|
225
|
-
delete rest.total_chunks;
|
|
226
|
-
delete rest.content_hash;
|
|
227
|
-
delete rest.chunk_text;
|
|
228
|
-
await writeMetadata(filePath, metadataDir, rest);
|
|
253
|
+
const enrichment = omit(payload, SYSTEM_KEYS);
|
|
254
|
+
await writeMetadata(filePath, metadataDir, enrichment);
|
|
229
255
|
}
|
|
230
256
|
return await reply.status(200).send({ ok: true });
|
|
231
257
|
}
|
|
@@ -242,20 +268,13 @@ function createApiServer(options) {
|
|
|
242
268
|
try {
|
|
243
269
|
if (scope === 'rules') {
|
|
244
270
|
// Re-apply inference rules to all files, update Qdrant payloads (no re-embedding)
|
|
245
|
-
const
|
|
246
|
-
|
|
247
|
-
// Use the new processRulesUpdate method
|
|
248
|
-
await processor.processRulesUpdate(file);
|
|
249
|
-
}
|
|
250
|
-
logger.info({ scope, filesProcessed: files.length }, 'Config reindex (rules) completed');
|
|
271
|
+
const count = await processAllFiles(options.config.watch.paths, options.config.watch.ignored, processor, 'processRulesUpdate');
|
|
272
|
+
logger.info({ scope, filesProcessed: count }, 'Config reindex (rules) completed');
|
|
251
273
|
}
|
|
252
274
|
else {
|
|
253
275
|
// Full reindex: re-extract, re-embed, re-upsert
|
|
254
|
-
const
|
|
255
|
-
|
|
256
|
-
await processor.processFile(file);
|
|
257
|
-
}
|
|
258
|
-
logger.info({ scope, filesProcessed: files.length }, 'Config reindex (full) completed');
|
|
276
|
+
const count = await processAllFiles(options.config.watch.paths, options.config.watch.ignored, processor, 'processFile');
|
|
277
|
+
logger.info({ scope, filesProcessed: count }, 'Config reindex (full) completed');
|
|
259
278
|
}
|
|
260
279
|
}
|
|
261
280
|
catch (error) {
|
|
@@ -272,117 +291,249 @@ function createApiServer(options) {
|
|
|
272
291
|
return app;
|
|
273
292
|
}
|
|
274
293
|
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
watch: {
|
|
282
|
-
type: 'object',
|
|
283
|
-
required: ['paths'],
|
|
284
|
-
properties: {
|
|
285
|
-
paths: { type: 'array', items: { type: 'string' }, minItems: 1 },
|
|
286
|
-
ignored: { type: 'array', items: { type: 'string' } },
|
|
287
|
-
pollIntervalMs: { type: 'number' },
|
|
288
|
-
usePolling: { type: 'boolean' },
|
|
289
|
-
debounceMs: { type: 'number' },
|
|
290
|
-
stabilityThresholdMs: { type: 'number' },
|
|
291
|
-
},
|
|
292
|
-
additionalProperties: false,
|
|
293
|
-
},
|
|
294
|
-
configWatch: {
|
|
295
|
-
type: 'object',
|
|
296
|
-
properties: {
|
|
297
|
-
enabled: { type: 'boolean' },
|
|
298
|
-
debounceMs: { type: 'number' },
|
|
299
|
-
},
|
|
300
|
-
additionalProperties: false,
|
|
301
|
-
},
|
|
302
|
-
embedding: {
|
|
303
|
-
type: 'object',
|
|
304
|
-
required: ['provider', 'model'],
|
|
305
|
-
properties: {
|
|
306
|
-
provider: { type: 'string' },
|
|
307
|
-
model: { type: 'string' },
|
|
308
|
-
chunkSize: { type: 'number' },
|
|
309
|
-
chunkOverlap: { type: 'number' },
|
|
310
|
-
dimensions: { type: 'number' },
|
|
311
|
-
apiKey: { type: 'string' },
|
|
312
|
-
rateLimitPerMinute: { type: 'number' },
|
|
313
|
-
concurrency: { type: 'number' },
|
|
314
|
-
},
|
|
315
|
-
additionalProperties: false,
|
|
316
|
-
},
|
|
317
|
-
vectorStore: {
|
|
318
|
-
type: 'object',
|
|
319
|
-
required: ['url', 'collectionName'],
|
|
320
|
-
properties: {
|
|
321
|
-
url: { type: 'string' },
|
|
322
|
-
collectionName: { type: 'string' },
|
|
323
|
-
apiKey: { type: 'string' },
|
|
324
|
-
},
|
|
325
|
-
additionalProperties: false,
|
|
326
|
-
},
|
|
327
|
-
metadataDir: { type: 'string' },
|
|
328
|
-
api: {
|
|
329
|
-
type: 'object',
|
|
330
|
-
properties: {
|
|
331
|
-
host: { type: 'string' },
|
|
332
|
-
port: { type: 'number' },
|
|
333
|
-
},
|
|
334
|
-
additionalProperties: false,
|
|
335
|
-
},
|
|
336
|
-
extractors: { type: 'object' },
|
|
337
|
-
inferenceRules: {
|
|
338
|
-
type: 'array',
|
|
339
|
-
items: {
|
|
340
|
-
type: 'object',
|
|
341
|
-
required: ['match', 'set'],
|
|
342
|
-
properties: {
|
|
343
|
-
match: { type: 'object' },
|
|
344
|
-
set: { type: 'object' },
|
|
345
|
-
},
|
|
346
|
-
additionalProperties: false,
|
|
347
|
-
},
|
|
348
|
-
},
|
|
349
|
-
logging: {
|
|
350
|
-
type: 'object',
|
|
351
|
-
properties: {
|
|
352
|
-
level: { type: 'string' },
|
|
353
|
-
file: { type: 'string' },
|
|
354
|
-
},
|
|
355
|
-
additionalProperties: false,
|
|
356
|
-
},
|
|
357
|
-
shutdownTimeoutMs: { type: 'number' },
|
|
358
|
-
},
|
|
359
|
-
additionalProperties: false,
|
|
360
|
-
};
|
|
361
|
-
const ajv = new Ajv({ allErrors: true });
|
|
362
|
-
const validate = ajv.compile(configSchema);
|
|
363
|
-
/** Default values for optional configuration fields. */
|
|
364
|
-
const DEFAULTS = {
|
|
365
|
-
configWatch: { enabled: true, debounceMs: 1000 },
|
|
294
|
+
/**
|
|
295
|
+
* @module config/defaults
|
|
296
|
+
* Default configuration values for jeeves-watcher. Pure data export, no I/O or side effects.
|
|
297
|
+
*/
|
|
298
|
+
/** Default root-level config values. */
|
|
299
|
+
const ROOT_DEFAULTS = {
|
|
366
300
|
metadataDir: '.jeeves-watcher',
|
|
367
|
-
api: { host: '127.0.0.1', port: 3100 },
|
|
368
|
-
logging: { level: 'info' },
|
|
369
301
|
shutdownTimeoutMs: 10000,
|
|
370
302
|
};
|
|
371
|
-
/** Default values
|
|
303
|
+
/** Default configWatch values. */
|
|
304
|
+
const CONFIG_WATCH_DEFAULTS = {
|
|
305
|
+
enabled: true,
|
|
306
|
+
debounceMs: 1000,
|
|
307
|
+
};
|
|
308
|
+
/** Default API values. */
|
|
309
|
+
const API_DEFAULTS = {
|
|
310
|
+
host: '127.0.0.1',
|
|
311
|
+
port: 3456,
|
|
312
|
+
};
|
|
313
|
+
/** Default logging values. */
|
|
314
|
+
const LOGGING_DEFAULTS = {
|
|
315
|
+
level: 'info',
|
|
316
|
+
};
|
|
317
|
+
/** Default watch configuration. */
|
|
372
318
|
const WATCH_DEFAULTS = {
|
|
373
319
|
debounceMs: 300,
|
|
374
320
|
stabilityThresholdMs: 500,
|
|
375
321
|
usePolling: false,
|
|
376
322
|
pollIntervalMs: 1000,
|
|
377
323
|
};
|
|
378
|
-
/** Default
|
|
324
|
+
/** Default embedding configuration. */
|
|
379
325
|
const EMBEDDING_DEFAULTS = {
|
|
380
326
|
chunkSize: 1000,
|
|
381
327
|
chunkOverlap: 200,
|
|
382
|
-
dimensions:
|
|
328
|
+
dimensions: 3072,
|
|
383
329
|
rateLimitPerMinute: 300,
|
|
384
330
|
concurrency: 5,
|
|
385
331
|
};
|
|
332
|
+
|
|
333
|
+
/**
|
|
334
|
+
* Watch configuration for file system monitoring.
|
|
335
|
+
*/
|
|
336
|
+
const watchConfigSchema = z.object({
|
|
337
|
+
/** Glob patterns to watch. */
|
|
338
|
+
paths: z
|
|
339
|
+
.array(z.string())
|
|
340
|
+
.min(1)
|
|
341
|
+
.describe('Glob patterns for files to watch (e.g., "**/*.md"). At least one required.'),
|
|
342
|
+
/** Glob patterns to ignore. */
|
|
343
|
+
ignored: z
|
|
344
|
+
.array(z.string())
|
|
345
|
+
.optional()
|
|
346
|
+
.describe('Glob patterns to exclude from watching (e.g., "**/node_modules/**").'),
|
|
347
|
+
/** Polling interval in milliseconds. */
|
|
348
|
+
pollIntervalMs: z
|
|
349
|
+
.number()
|
|
350
|
+
.optional()
|
|
351
|
+
.describe('Polling interval in milliseconds when usePolling is enabled.'),
|
|
352
|
+
/** Whether to use polling instead of native watchers. */
|
|
353
|
+
usePolling: z
|
|
354
|
+
.boolean()
|
|
355
|
+
.optional()
|
|
356
|
+
.describe('Use polling instead of native file system events (for network drives).'),
|
|
357
|
+
/** Debounce delay in milliseconds for file change events. */
|
|
358
|
+
debounceMs: z
|
|
359
|
+
.number()
|
|
360
|
+
.optional()
|
|
361
|
+
.describe('Debounce delay in milliseconds for file change events.'),
|
|
362
|
+
/** Time in milliseconds a file must be stable before processing. */
|
|
363
|
+
stabilityThresholdMs: z
|
|
364
|
+
.number()
|
|
365
|
+
.optional()
|
|
366
|
+
.describe('Time in milliseconds a file must remain unchanged before processing.'),
|
|
367
|
+
});
|
|
368
|
+
/**
|
|
369
|
+
* Configuration watch settings.
|
|
370
|
+
*/
|
|
371
|
+
const configWatchConfigSchema = z.object({
|
|
372
|
+
/** Whether config file watching is enabled. */
|
|
373
|
+
enabled: z
|
|
374
|
+
.boolean()
|
|
375
|
+
.optional()
|
|
376
|
+
.describe('Enable automatic reloading when config file changes.'),
|
|
377
|
+
/** Debounce delay in milliseconds for config change events. */
|
|
378
|
+
debounceMs: z
|
|
379
|
+
.number()
|
|
380
|
+
.optional()
|
|
381
|
+
.describe('Debounce delay in milliseconds for config file change detection.'),
|
|
382
|
+
});
|
|
383
|
+
/**
|
|
384
|
+
* Embedding model configuration.
|
|
385
|
+
*/
|
|
386
|
+
const embeddingConfigSchema = z.object({
|
|
387
|
+
/** The embedding model provider. */
|
|
388
|
+
provider: z
|
|
389
|
+
.string()
|
|
390
|
+
.default('gemini')
|
|
391
|
+
.describe('Embedding provider name (e.g., "gemini", "openai").'),
|
|
392
|
+
/** The embedding model name. */
|
|
393
|
+
model: z
|
|
394
|
+
.string()
|
|
395
|
+
.default('gemini-embedding-001')
|
|
396
|
+
.describe('Embedding model identifier (e.g., "gemini-embedding-001", "text-embedding-3-small").'),
|
|
397
|
+
/** Maximum chunk size in characters for text splitting. */
|
|
398
|
+
chunkSize: z
|
|
399
|
+
.number()
|
|
400
|
+
.optional()
|
|
401
|
+
.describe('Maximum chunk size in characters for text splitting.'),
|
|
402
|
+
/** Overlap between consecutive chunks, in characters. */
|
|
403
|
+
chunkOverlap: z
|
|
404
|
+
.number()
|
|
405
|
+
.optional()
|
|
406
|
+
.describe('Character overlap between consecutive chunks.'),
|
|
407
|
+
/** Embedding vector dimensions. */
|
|
408
|
+
dimensions: z
|
|
409
|
+
.number()
|
|
410
|
+
.optional()
|
|
411
|
+
.describe('Embedding vector dimensions (must match model output).'),
|
|
412
|
+
/** API key for the embedding provider. */
|
|
413
|
+
apiKey: z
|
|
414
|
+
.string()
|
|
415
|
+
.optional()
|
|
416
|
+
.describe('API key for embedding provider (supports ${ENV_VAR} substitution).'),
|
|
417
|
+
/** Maximum embedding requests per minute. */
|
|
418
|
+
rateLimitPerMinute: z
|
|
419
|
+
.number()
|
|
420
|
+
.optional()
|
|
421
|
+
.describe('Maximum embedding API requests per minute (rate limiting).'),
|
|
422
|
+
/** Maximum concurrent embedding requests. */
|
|
423
|
+
concurrency: z
|
|
424
|
+
.number()
|
|
425
|
+
.optional()
|
|
426
|
+
.describe('Maximum concurrent embedding requests.'),
|
|
427
|
+
});
|
|
428
|
+
/**
|
|
429
|
+
* Vector store configuration for Qdrant.
|
|
430
|
+
*/
|
|
431
|
+
const vectorStoreConfigSchema = z.object({
|
|
432
|
+
/** Qdrant server URL. */
|
|
433
|
+
url: z
|
|
434
|
+
.string()
|
|
435
|
+
.describe('Qdrant server URL (e.g., "http://localhost:6333").'),
|
|
436
|
+
/** Qdrant collection name. */
|
|
437
|
+
collectionName: z
|
|
438
|
+
.string()
|
|
439
|
+
.describe('Qdrant collection name for vector storage.'),
|
|
440
|
+
/** Qdrant API key. */
|
|
441
|
+
apiKey: z
|
|
442
|
+
.string()
|
|
443
|
+
.optional()
|
|
444
|
+
.describe('Qdrant API key for authentication (supports ${ENV_VAR} substitution).'),
|
|
445
|
+
});
|
|
446
|
+
/**
|
|
447
|
+
* API server configuration.
|
|
448
|
+
*/
|
|
449
|
+
const apiConfigSchema = z.object({
|
|
450
|
+
/** Host to bind to. */
|
|
451
|
+
host: z
|
|
452
|
+
.string()
|
|
453
|
+
.optional()
|
|
454
|
+
.describe('Host address for API server (e.g., "127.0.0.1", "0.0.0.0").'),
|
|
455
|
+
/** Port to listen on. */
|
|
456
|
+
port: z.number().optional().describe('Port for API server (e.g., 3456).'),
|
|
457
|
+
});
|
|
458
|
+
/**
|
|
459
|
+
* Logging configuration.
|
|
460
|
+
*/
|
|
461
|
+
const loggingConfigSchema = z.object({
|
|
462
|
+
/** Log level. */
|
|
463
|
+
level: z
|
|
464
|
+
.string()
|
|
465
|
+
.optional()
|
|
466
|
+
.describe('Logging level (trace, debug, info, warn, error, fatal).'),
|
|
467
|
+
/** Log file path. */
|
|
468
|
+
file: z
|
|
469
|
+
.string()
|
|
470
|
+
.optional()
|
|
471
|
+
.describe('Path to log file (logs to stdout if omitted).'),
|
|
472
|
+
});
|
|
473
|
+
/**
|
|
474
|
+
* An inference rule that enriches document metadata.
|
|
475
|
+
*/
|
|
476
|
+
const inferenceRuleSchema = z.object({
|
|
477
|
+
/** JSON Schema object to match against document metadata. */
|
|
478
|
+
match: z
|
|
479
|
+
.record(z.string(), z.unknown())
|
|
480
|
+
.describe('JSON Schema object to match against file attributes.'),
|
|
481
|
+
/** Metadata fields to set when the rule matches. */
|
|
482
|
+
set: z
|
|
483
|
+
.record(z.string(), z.unknown())
|
|
484
|
+
.describe('Metadata fields to set when match succeeds.'),
|
|
485
|
+
/** JsonMap transformation (inline or reference to named map). */
|
|
486
|
+
map: z
|
|
487
|
+
.union([jsonMapMapSchema, z.string()])
|
|
488
|
+
.optional()
|
|
489
|
+
.describe('JsonMap transformation (inline definition or named map reference).'),
|
|
490
|
+
});
|
|
491
|
+
/**
|
|
492
|
+
* Top-level configuration for jeeves-watcher.
|
|
493
|
+
*/
|
|
494
|
+
const jeevesWatcherConfigSchema = z.object({
|
|
495
|
+
/** File system watch configuration. */
|
|
496
|
+
watch: watchConfigSchema.describe('File system watch configuration.'),
|
|
497
|
+
/** Configuration file watch settings. */
|
|
498
|
+
configWatch: configWatchConfigSchema
|
|
499
|
+
.optional()
|
|
500
|
+
.describe('Configuration file watch settings.'),
|
|
501
|
+
/** Embedding model configuration. */
|
|
502
|
+
embedding: embeddingConfigSchema.describe('Embedding model configuration.'),
|
|
503
|
+
/** Vector store configuration. */
|
|
504
|
+
vectorStore: vectorStoreConfigSchema.describe('Qdrant vector store configuration.'),
|
|
505
|
+
/** Directory for persisted metadata. */
|
|
506
|
+
metadataDir: z
|
|
507
|
+
.string()
|
|
508
|
+
.optional()
|
|
509
|
+
.describe('Directory for persisted metadata sidecar files.'),
|
|
510
|
+
/** API server configuration. */
|
|
511
|
+
api: apiConfigSchema.optional().describe('API server configuration.'),
|
|
512
|
+
/** Extractor configurations keyed by name. */
|
|
513
|
+
extractors: z
|
|
514
|
+
.record(z.string(), z.unknown())
|
|
515
|
+
.optional()
|
|
516
|
+
.describe('Extractor configurations keyed by name.'),
|
|
517
|
+
/** Rules for inferring metadata from document properties. */
|
|
518
|
+
inferenceRules: z
|
|
519
|
+
.array(inferenceRuleSchema)
|
|
520
|
+
.optional()
|
|
521
|
+
.describe('Rules for inferring metadata from file attributes.'),
|
|
522
|
+
/** Reusable named JsonMap transformations. */
|
|
523
|
+
maps: z
|
|
524
|
+
.record(z.string(), jsonMapMapSchema)
|
|
525
|
+
.optional()
|
|
526
|
+
.describe('Reusable named JsonMap transformations.'),
|
|
527
|
+
/** Logging configuration. */
|
|
528
|
+
logging: loggingConfigSchema.optional().describe('Logging configuration.'),
|
|
529
|
+
/** Timeout in milliseconds for graceful shutdown. */
|
|
530
|
+
shutdownTimeoutMs: z
|
|
531
|
+
.number()
|
|
532
|
+
.optional()
|
|
533
|
+
.describe('Timeout in milliseconds for graceful shutdown.'),
|
|
534
|
+
});
|
|
535
|
+
|
|
536
|
+
const MODULE_NAME = 'jeeves-watcher';
|
|
386
537
|
/**
|
|
387
538
|
* Merge sensible defaults into a loaded configuration.
|
|
388
539
|
*
|
|
@@ -391,13 +542,13 @@ const EMBEDDING_DEFAULTS = {
|
|
|
391
542
|
*/
|
|
392
543
|
function applyDefaults(raw) {
    // Each nested section is merged on its own so that a partially specified
    // section keeps the defaults for the keys it leaves out.
    const watch = { ...WATCH_DEFAULTS, ...raw.watch };
    const configWatch = { ...CONFIG_WATCH_DEFAULTS, ...raw.configWatch };
    const embedding = { ...EMBEDDING_DEFAULTS, ...raw.embedding };
    const api = { ...API_DEFAULTS, ...raw.api };
    const logging = { ...LOGGING_DEFAULTS, ...raw.logging };
    // Root-level defaults first, then the user's values, then the merged
    // sections (which replace the raw, un-defaulted ones copied by `...raw`).
    return {
        ...ROOT_DEFAULTS,
        ...raw,
        watch,
        configWatch,
        embedding,
        api,
        logging,
    };
}
|
|
403
554
|
/**
|
|
@@ -415,21 +566,89 @@ async function loadConfig(configPath) {
|
|
|
415
566
|
if (!result || result.isEmpty) {
|
|
416
567
|
throw new Error('No jeeves-watcher configuration found. Create a .jeeves-watcherrc or jeeves-watcher.config.{js,ts,json,yaml} file.');
|
|
417
568
|
}
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
569
|
+
try {
|
|
570
|
+
const validated = jeevesWatcherConfigSchema.parse(result.config);
|
|
571
|
+
return applyDefaults(validated);
|
|
572
|
+
}
|
|
573
|
+
catch (error) {
|
|
574
|
+
if (error instanceof ZodError) {
|
|
575
|
+
const errors = error.issues
|
|
576
|
+
.map((issue) => `${issue.path.join('.')}: ${issue.message}`)
|
|
577
|
+
.join('; ');
|
|
578
|
+
throw new Error(`Invalid jeeves-watcher configuration: ${errors}`);
|
|
579
|
+
}
|
|
580
|
+
throw error;
|
|
429
581
|
}
|
|
430
|
-
return applyDefaults(raw);
|
|
431
582
|
}
|
|
432
583
|
|
|
584
|
+
/**
|
|
585
|
+
* @module util/retry
|
|
586
|
+
* Small async retry helper with exponential backoff. Side effects: sleeps between attempts; can invoke onRetry callback for logging.
|
|
587
|
+
*/
|
|
588
|
+
/**
 * Wait for a number of milliseconds, optionally cancellable via an AbortSignal.
 *
 * @param ms - Delay in milliseconds; zero or a negative value resolves immediately.
 * @param signal - Optional AbortSignal. Aborting it rejects the pending sleep.
 * @returns A promise that resolves once the delay has elapsed, or rejects with
 * `Error('Retry sleep aborted')` (the signal's abort reason attached as `cause`).
 */
function sleep(ms, signal) {
    if (ms <= 0)
        return Promise.resolve();
    const abortError = () => new Error('Retry sleep aborted', { cause: signal?.reason });
    // Already cancelled: reject straight away without arming a timer or listener.
    if (signal?.aborted)
        return Promise.reject(abortError());
    return new Promise((resolve, reject) => {
        const onAbort = () => {
            // The `once` listener removes itself; only the timer needs clearing.
            clearTimeout(timer);
            reject(abortError());
        };
        const timer = setTimeout(() => {
            // Detach the listener so a long-lived signal does not keep it alive.
            signal?.removeEventListener('abort', onAbort);
            resolve();
        }, ms);
        signal?.addEventListener('abort', onAbort, { once: true });
    });
}
|
|
614
|
+
/**
 * Compute the backoff delay to wait after a failed attempt.
 *
 * The delay starts at `baseDelayMs` and doubles with every further attempt,
 * optionally stretched by a random factor in `[1, 1 + jitter)`.
 *
 * @param attempt - 1-based number of the attempt that just failed.
 * @param baseDelayMs - Delay after the first failed attempt.
 * @param maxDelayMs - Upper bound for the returned delay.
 * @param jitter - Fraction of extra random delay (0 disables jitter).
 * @returns The delay in milliseconds, never greater than `maxDelayMs`.
 */
function computeDelayMs(attempt, baseDelayMs, maxDelayMs, jitter = 0) {
    const exp = Math.max(0, attempt - 1);
    const raw = Math.min(maxDelayMs, baseDelayMs * 2 ** exp);
    const factor = jitter > 0 ? 1 + Math.random() * jitter : 1;
    // Clamp again after jitter: the random stretch must not push the delay
    // past the configured ceiling.
    return Math.min(maxDelayMs, Math.round(raw * factor));
}
|
|
620
|
+
/**
 * Retry an async operation using exponential backoff.
 *
 * `fn` is invoked with the 1-based attempt number. After each failure except
 * the last, `options.onRetry` (if given) is called with
 * `{ attempt, attempts, delayMs, error }` and the helper sleeps for the
 * computed delay before trying again.
 *
 * @param fn - Operation to execute.
 * @param options - Retry policy: `attempts`, `baseDelayMs`, `maxDelayMs`,
 * and optional `jitter`, `onRetry`, `signal`.
 * @returns The operation result.
 * @throws The error of the final failed attempt, or the sleep's abort error
 * when `options.signal` is aborted between attempts.
 */
async function retry(fn, options) {
    // A missing or non-numeric `attempts` becomes NaN; `Math.max(1, NaN)` is
    // NaN too, which would skip the loop entirely, never call `fn`, and throw
    // `undefined`. Treat that case as a single attempt instead.
    const requested = Number(options.attempts);
    const attempts = Number.isNaN(requested) ? 1 : Math.max(1, requested);
    let lastError;
    for (let attempt = 1; attempt <= attempts; attempt++) {
        try {
            return await fn(attempt);
        }
        catch (error) {
            lastError = error;
            // No backoff after the final attempt; fall through to the throw.
            if (attempt >= attempts)
                break;
            const delayMs = computeDelayMs(attempt, options.baseDelayMs, options.maxDelayMs, options.jitter);
            options.onRetry?.({ attempt, attempts, delayMs, error });
            await sleep(delayMs, options.signal);
        }
    }
    throw lastError;
}
|
|
646
|
+
|
|
647
|
+
/**
|
|
648
|
+
* @module embedding
|
|
649
|
+
*
|
|
650
|
+
* Embedding provider abstractions and registry-backed factory.
|
|
651
|
+
*/
|
|
433
652
|
/**
|
|
434
653
|
* Create a mock embedding provider that generates deterministic vectors from content hashes.
|
|
435
654
|
*
|
|
@@ -457,10 +676,11 @@ function createMockProvider(dimensions) {
|
|
|
457
676
|
* Create a Gemini embedding provider using the Google Generative AI SDK.
|
|
458
677
|
*
|
|
459
678
|
* @param config - The embedding configuration.
|
|
679
|
+
* @param logger - Optional pino logger for retry warnings.
|
|
460
680
|
* @returns A Gemini {@link EmbeddingProvider}.
|
|
461
681
|
* @throws If the API key is missing.
|
|
462
682
|
*/
|
|
463
|
-
function createGeminiProvider(config) {
|
|
683
|
+
function createGeminiProvider(config, logger) {
|
|
464
684
|
if (!config.apiKey) {
|
|
465
685
|
throw new Error('Gemini embedding provider requires config.embedding.apiKey');
|
|
466
686
|
}
|
|
@@ -472,8 +692,43 @@ function createGeminiProvider(config) {
|
|
|
472
692
|
return {
|
|
473
693
|
dimensions,
|
|
474
694
|
async embed(texts) {
|
|
475
|
-
|
|
476
|
-
|
|
695
|
+
const vectors = await retry(async (attempt) => {
|
|
696
|
+
if (attempt > 1) {
|
|
697
|
+
const msg = {
|
|
698
|
+
attempt,
|
|
699
|
+
provider: 'gemini',
|
|
700
|
+
model: config.model,
|
|
701
|
+
};
|
|
702
|
+
if (logger) {
|
|
703
|
+
logger.warn(msg, 'Retrying embedding request');
|
|
704
|
+
}
|
|
705
|
+
else {
|
|
706
|
+
console.warn(msg, 'Retrying embedding request');
|
|
707
|
+
}
|
|
708
|
+
}
|
|
709
|
+
// embedDocuments returns vectors for multiple texts
|
|
710
|
+
return embedder.embedDocuments(texts);
|
|
711
|
+
}, {
|
|
712
|
+
attempts: 5,
|
|
713
|
+
baseDelayMs: 500,
|
|
714
|
+
maxDelayMs: 10_000,
|
|
715
|
+
jitter: 0.2,
|
|
716
|
+
onRetry: ({ attempt, delayMs, error }) => {
|
|
717
|
+
const msg = {
|
|
718
|
+
attempt,
|
|
719
|
+
delayMs,
|
|
720
|
+
provider: 'gemini',
|
|
721
|
+
model: config.model,
|
|
722
|
+
error,
|
|
723
|
+
};
|
|
724
|
+
if (logger) {
|
|
725
|
+
logger.warn(msg, 'Embedding call failed; will retry');
|
|
726
|
+
}
|
|
727
|
+
else {
|
|
728
|
+
console.warn(msg, 'Embedding call failed; will retry');
|
|
729
|
+
}
|
|
730
|
+
},
|
|
731
|
+
});
|
|
477
732
|
// Validate dimensions
|
|
478
733
|
for (const vector of vectors) {
|
|
479
734
|
if (vector.length !== dimensions) {
|
|
@@ -484,25 +739,36 @@ function createGeminiProvider(config) {
|
|
|
484
739
|
},
|
|
485
740
|
};
|
|
486
741
|
}
|
|
742
|
+
/**
 * Registry adapter for the mock provider: builds it from an embedding config.
 *
 * @param config - The embedding configuration.
 * @returns Whatever `createMockProvider` returns for the resolved dimensions.
 */
function createMockFromConfig(config) {
    // Use 768 dimensions when the config does not specify any.
    return createMockProvider(config.dimensions ?? 768);
}
|
|
746
|
+
/** Maps a provider name to the factory that builds it from `(config, logger)`. */
const embeddingProviderRegistry = new Map()
    .set('mock', createMockFromConfig)
    .set('gemini', createGeminiProvider);
|
|
487
750
|
/**
|
|
488
751
|
* Create an embedding provider based on the given configuration.
|
|
489
752
|
*
|
|
753
|
+
* Each provider is responsible for its own default dimensions.
|
|
754
|
+
*
|
|
490
755
|
* @param config - The embedding configuration.
|
|
756
|
+
* @param logger - Optional pino logger for retry warnings.
|
|
491
757
|
* @returns An {@link EmbeddingProvider} instance.
|
|
492
758
|
* @throws If the configured provider is not supported.
|
|
493
759
|
*/
|
|
494
|
-
function createEmbeddingProvider(config) {
|
|
495
|
-
const
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
return createMockProvider(dimensions);
|
|
499
|
-
case 'gemini':
|
|
500
|
-
return createGeminiProvider(config);
|
|
501
|
-
default:
|
|
502
|
-
throw new Error(`Unsupported embedding provider: ${config.provider}`);
|
|
760
|
+
function createEmbeddingProvider(config, logger) {
    const { provider } = config;
    // Look the factory up in the registry; unknown names are a hard error.
    const build = embeddingProviderRegistry.get(provider);
    if (build) {
        return build(config, logger);
    }
    throw new Error(`Unsupported embedding provider: ${provider}`);
}
|
|
505
767
|
|
|
768
|
+
/**
|
|
769
|
+
* @module logger
|
|
770
|
+
* Creates pino logger instances. I/O: optionally writes logs to file via pino/file transport. Defaults to stdout at info level.
|
|
771
|
+
*/
|
|
506
772
|
/**
|
|
507
773
|
* Create a pino logger instance.
|
|
508
774
|
*
|
|
@@ -521,6 +787,54 @@ function createLogger(config) {
|
|
|
521
787
|
return pino({ level });
|
|
522
788
|
}
|
|
523
789
|
|
|
790
|
+
/**
|
|
791
|
+
* @module hash
|
|
792
|
+
* Provides SHA-256 content hashing. Pure function: given text string, returns hex digest. No I/O or side effects.
|
|
793
|
+
*/
|
|
794
|
+
/**
 * Hash a piece of text with SHA-256.
 *
 * @param text - The text to hash; it is fed to the hash as UTF-8.
 * @returns The digest as a hex string.
 */
function contentHash(text) {
    const hasher = createHash('sha256');
    hasher.update(text, 'utf8');
    return hasher.digest('hex');
}
|
|
803
|
+
|
|
804
|
+
/**
|
|
805
|
+
* @module pointId
|
|
806
|
+
* Generates deterministic UUIDv5 point IDs for file paths and chunk indices. Pure function: normalizes paths, returns stable IDs. No I/O.
|
|
807
|
+
*/
|
|
808
|
+
/** Namespace UUID for jeeves-watcher point IDs. */
|
|
809
|
+
const NAMESPACE = '6a6f686e-6761-4c74-ad6a-656576657321';
|
|
810
|
+
/**
 * Bring a file path into the canonical form used for point ID generation:
 * every backslash becomes a forward slash and the result is lower-cased.
 *
 * @param filePath - The original file path.
 * @returns The normalised path string.
 */
function normalisePath(filePath) {
    const withForwardSlashes = filePath.split('\\').join('/');
    return withForwardSlashes.toLowerCase();
}
|
|
819
|
+
/**
 * Derive a deterministic UUID v5 point ID for a file, or for one chunk of it.
 *
 * The UUID name is the normalised path, with `#<chunkIndex>` appended when a
 * chunk index is supplied.
 *
 * @param filePath - The file path.
 * @param chunkIndex - Optional chunk index within the file.
 * @returns A deterministic UUID v5 string.
 */
function pointId(filePath, chunkIndex) {
    const normalised = normalisePath(filePath);
    if (chunkIndex === undefined) {
        return v5(normalised, NAMESPACE);
    }
    return v5(`${normalised}#${String(chunkIndex)}`, NAMESPACE);
}
|
|
832
|
+
|
|
833
|
+
/**
|
|
834
|
+
* @module extractors
|
|
835
|
+
*
|
|
836
|
+
* Text extraction registry for supported file formats.
|
|
837
|
+
*/
|
|
524
838
|
/**
|
|
525
839
|
* Extract YAML frontmatter from a Markdown document.
|
|
526
840
|
*
|
|
@@ -566,6 +880,55 @@ function extractJsonText(obj) {
|
|
|
566
880
|
}
|
|
567
881
|
return JSON.stringify(obj);
|
|
568
882
|
}
|
|
883
|
+
/**
 * Read a Markdown file and separate its frontmatter from the body.
 *
 * @param filePath - Path of the Markdown file.
 * @returns `{ text, frontmatter }`, where `text` is the body.
 */
async function extractMarkdown(filePath) {
    const source = await readFile(filePath, 'utf8');
    const parts = extractMarkdownFrontmatter(source);
    return { text: parts.body, frontmatter: parts.frontmatter };
}
|
|
888
|
+
/**
 * Read a plain-text file as UTF-8 and return its content unchanged.
 *
 * @param filePath - Path of the text file.
 * @returns `{ text }` holding the full file content.
 */
async function extractPlaintext(filePath) {
    const text = await readFile(filePath, 'utf8');
    return { text };
}
|
|
892
|
+
/**
 * Read and parse a JSON file.
 *
 * @param filePath - Path of the JSON file.
 * @returns `{ text, json }`: `text` comes from `extractJsonText`; `json` is the
 * parsed value when it is a non-array object, otherwise `undefined`.
 * @throws SyntaxError if the file content is not valid JSON.
 */
async function extractJson(filePath) {
    const parsed = JSON.parse(await readFile(filePath, 'utf8'));
    // Only plain objects are exposed as structured data; arrays, primitives
    // and null are not.
    const isObjectLike = Boolean(parsed) && typeof parsed === 'object' && !Array.isArray(parsed);
    const text = extractJsonText(parsed);
    return { text, json: isObjectLike ? parsed : undefined };
}
|
|
900
|
+
/**
 * Extract the text of a PDF file using `unpdf`, which is loaded on demand.
 *
 * @param filePath - Path of the PDF file.
 * @returns `{ text }`; when `unpdf` yields an array, its entries are joined
 * with blank lines.
 */
async function extractPdf(filePath) {
    const bytes = new Uint8Array(await readFile(filePath));
    // Imported only when a PDF is actually processed.
    const unpdf = await import('unpdf');
    const extracted = await unpdf.extractText(bytes);
    const pages = extracted.text;
    // unpdf returns an array of strings (one per page)
    if (Array.isArray(pages)) {
        return { text: pages.join('\n\n') };
    }
    return { text: pages };
}
|
|
909
|
+
/**
 * Extract raw text from a .docx file using mammoth.
 *
 * @param filePath - Path of the .docx file to read.
 * @returns An object whose `text` is the `value` of mammoth's raw-text result.
 */
async function extractDocx(filePath) {
    const fileBuffer = await readFile(filePath);
    const { value } = await mammoth.extractRawText({ buffer: fileBuffer });
    return { text: value };
}
|
|
914
|
+
/**
 * Extract visible text from an HTML file using cheerio.
 *
 * @param filePath - Path of the HTML file to read.
 * @returns An object whose `text` is the trimmed body text, or the trimmed
 *   text of the whole document when the body text is empty.
 */
async function extractHtml(filePath) {
    const markup = await readFile(filePath, 'utf8');
    const $ = cheerio.load(markup);
    // Drop script and style elements so their contents are not indexed as text.
    $('script, style').remove();
    const bodyText = $('body').text().trim();
    if (bodyText) {
        return { text: bodyText };
    }
    return { text: $.text().trim() };
}
|
|
921
|
+
/**
 * Lookup table from lower-case file extension (including the leading dot)
 * to the extractor function that handles it.
 */
const extractorRegistry = new Map(Object.entries({
    '.md': extractMarkdown,
    '.markdown': extractMarkdown,
    '.txt': extractPlaintext,
    '.text': extractPlaintext,
    '.json': extractJson,
    '.pdf': extractPdf,
    '.docx': extractDocx,
    '.html': extractHtml,
    '.htm': extractHtml,
}));
|
|
569
932
|
/**
|
|
570
933
|
* Extract text from a file based on extension.
|
|
571
934
|
*
|
|
@@ -574,85 +937,11 @@ function extractJsonText(obj) {
|
|
|
574
937
|
* @returns Extracted text and optional structured data.
|
|
575
938
|
*/
|
|
576
939
|
async function extractText(filePath, extension) {
|
|
577
|
-
const
|
|
578
|
-
if (
|
|
579
|
-
|
|
580
|
-
const { frontmatter, body } = extractMarkdownFrontmatter(raw);
|
|
581
|
-
return { text: body, frontmatter };
|
|
582
|
-
}
|
|
583
|
-
if (ext === '.txt' || ext === '.text') {
|
|
584
|
-
const raw = await readFile(filePath, 'utf8');
|
|
585
|
-
return { text: raw };
|
|
586
|
-
}
|
|
587
|
-
if (ext === '.json') {
|
|
588
|
-
const raw = await readFile(filePath, 'utf8');
|
|
589
|
-
const parsed = JSON.parse(raw);
|
|
590
|
-
const json = parsed && typeof parsed === 'object' && !Array.isArray(parsed)
|
|
591
|
-
? parsed
|
|
592
|
-
: undefined;
|
|
593
|
-
return { text: extractJsonText(parsed), json };
|
|
594
|
-
}
|
|
595
|
-
if (ext === '.pdf') {
|
|
596
|
-
const buffer = await readFile(filePath);
|
|
597
|
-
const uint8Array = new Uint8Array(buffer);
|
|
598
|
-
const { extractText: extractPdfText } = await import('unpdf');
|
|
599
|
-
const { text } = await extractPdfText(uint8Array);
|
|
600
|
-
// unpdf returns an array of strings (one per page)
|
|
601
|
-
const content = Array.isArray(text) ? text.join('\n\n') : text;
|
|
602
|
-
return { text: content };
|
|
603
|
-
}
|
|
604
|
-
if (ext === '.docx') {
|
|
605
|
-
const buffer = await readFile(filePath);
|
|
606
|
-
const result = await mammoth.extractRawText({ buffer });
|
|
607
|
-
return { text: result.value };
|
|
608
|
-
}
|
|
609
|
-
if (ext === '.html' || ext === '.htm') {
|
|
610
|
-
const raw = await readFile(filePath, 'utf8');
|
|
611
|
-
const $ = cheerio.load(raw);
|
|
612
|
-
// Remove script and style elements
|
|
613
|
-
$('script, style').remove();
|
|
614
|
-
// Extract text content
|
|
615
|
-
const text = $('body').text().trim() || $.text().trim();
|
|
616
|
-
return { text };
|
|
617
|
-
}
|
|
940
|
+
const extractor = extractorRegistry.get(extension.toLowerCase());
|
|
941
|
+
if (extractor)
|
|
942
|
+
return extractor(filePath);
|
|
618
943
|
// Default: treat as plaintext.
|
|
619
|
-
|
|
620
|
-
return { text: raw };
|
|
621
|
-
}
|
|
622
|
-
|
|
623
|
-
/**
|
|
624
|
-
* Compute a SHA-256 hex digest of the given text.
|
|
625
|
-
*
|
|
626
|
-
* @param text - The input text to hash.
|
|
627
|
-
* @returns The hex-encoded SHA-256 hash.
|
|
628
|
-
*/
|
|
629
|
-
function contentHash(text) {
|
|
630
|
-
return createHash('sha256').update(text, 'utf8').digest('hex');
|
|
631
|
-
}
|
|
632
|
-
|
|
633
|
-
/** Namespace UUID for jeeves-watcher point IDs. */
|
|
634
|
-
const NAMESPACE = '6a6f686e-6761-4c74-ad6a-656576657321';
|
|
635
|
-
/**
|
|
636
|
-
* Normalise a file path for deterministic point ID generation.
|
|
637
|
-
*
|
|
638
|
-
* @param filePath - The original file path.
|
|
639
|
-
* @returns The normalised path string.
|
|
640
|
-
*/
|
|
641
|
-
function normalisePath(filePath) {
|
|
642
|
-
return filePath.replace(/\\/g, '/').toLowerCase();
|
|
643
|
-
}
|
|
644
|
-
/**
|
|
645
|
-
* Generate a deterministic UUID v5 point ID for a file (and optional chunk index).
|
|
646
|
-
*
|
|
647
|
-
* @param filePath - The file path.
|
|
648
|
-
* @param chunkIndex - Optional chunk index within the file.
|
|
649
|
-
* @returns A deterministic UUID v5 string.
|
|
650
|
-
*/
|
|
651
|
-
function pointId(filePath, chunkIndex) {
|
|
652
|
-
const key = chunkIndex !== undefined
|
|
653
|
-
? `${normalisePath(filePath)}#${String(chunkIndex)}`
|
|
654
|
-
: normalisePath(filePath);
|
|
655
|
-
return v5(key, NAMESPACE);
|
|
944
|
+
return extractPlaintext(filePath);
|
|
656
945
|
}
|
|
657
946
|
|
|
658
947
|
/**
|
|
@@ -725,13 +1014,7 @@ function resolveTemplateVars(value, attributes) {
|
|
|
725
1014
|
if (typeof value !== 'string')
|
|
726
1015
|
return value;
|
|
727
1016
|
return value.replace(/\$\{([^}]+)\}/g, (_match, varPath) => {
|
|
728
|
-
const
|
|
729
|
-
let current = attributes;
|
|
730
|
-
for (const part of parts) {
|
|
731
|
-
if (current === null || current === undefined)
|
|
732
|
-
return '';
|
|
733
|
-
current = current[part];
|
|
734
|
-
}
|
|
1017
|
+
const current = get(attributes, varPath);
|
|
735
1018
|
if (current === null || current === undefined)
|
|
736
1019
|
return '';
|
|
737
1020
|
return typeof current === 'string' ? current : JSON.stringify(current);
|
|
@@ -751,25 +1034,170 @@ function resolveSet(setObj, attributes) {
|
|
|
751
1034
|
}
|
|
752
1035
|
return result;
|
|
753
1036
|
}
|
|
1037
|
+
/**
 * Build the helper library handed to JsonMap transformations.
 *
 * Each entry is a thin wrapper over the matching String/Array method, plus
 * `get`, which delegates to the imported `get` helper.
 *
 * @returns The lib object.
 */
function createJsonMapLib() {
    const split = (text, delimiter) => text.split(delimiter);
    const slice = (items, from, to) => items.slice(from, to);
    const join = (items, delimiter) => items.join(delimiter);
    const toLowerCase = (text) => text.toLowerCase();
    const replace = (text, pattern, substitute) => text.replace(pattern, substitute);
    const lookup = (source, keyPath) => get(source, keyPath);
    return { split, slice, join, toLowerCase, replace, get: lookup };
}
|
|
754
1053
|
/**
|
|
755
1054
|
* Apply compiled inference rules to file attributes, returning merged metadata.
|
|
756
1055
|
*
|
|
757
1056
|
* Rules are evaluated in order; later rules override earlier ones.
|
|
1057
|
+
* If a rule has a `map`, the JsonMap transformation is applied after `set` resolution,
|
|
1058
|
+
* and map output overrides set output on conflict.
|
|
758
1059
|
*
|
|
759
1060
|
* @param compiledRules - The compiled rules to evaluate.
|
|
760
1061
|
* @param attributes - The file attributes to match against.
|
|
1062
|
+
* @param namedMaps - Optional record of named JsonMap definitions.
|
|
1063
|
+
* @param logger - Optional pino logger for warnings (falls back to console.warn).
|
|
761
1064
|
* @returns The merged metadata from all matching rules.
|
|
762
1065
|
*/
|
|
763
|
-
function applyRules(compiledRules, attributes) {
|
|
1066
|
+
async function applyRules(compiledRules, attributes, namedMaps, logger) {
    // JsonMap's type definitions expect a lib of unary functions; the helpers
    // here take several arguments, which JsonMap supports at runtime.
    const lib = createJsonMapLib();
    // Without a logger, warnings go to the console.
    const log = logger ?? console;
    let merged = {};
    for (const { rule, validate } of compiledRules) {
        if (!validate(attributes)) {
            continue;
        }
        // Step 1: `set` output is merged first, so later rules override earlier ones.
        merged = { ...merged, ...resolveSet(rule.set, attributes) };
        if (!rule.map) {
            continue;
        }
        // Step 2: a string `map` is a reference into namedMaps; anything else is inline.
        const isReference = typeof rule.map === 'string';
        const mapDef = isReference ? namedMaps?.[rule.map] : rule.map;
        if (isReference && !mapDef) {
            log.warn(`Map reference "${rule.map}" not found in named maps. Skipping map transformation.`);
            continue;
        }
        // Step 3: run the transformation; a failure is logged and the rule's
        // `set` output is kept.
        try {
            const jsonMap = new JsonMap(mapDef, lib);
            const mapOutput = await jsonMap.transform(attributes);
            const isRecord = mapOutput &&
                typeof mapOutput === 'object' &&
                !Array.isArray(mapOutput);
            if (isRecord) {
                // Map output overrides set output on conflicting keys.
                merged = { ...merged, ...mapOutput };
            }
            else {
                log.warn(`JsonMap transformation did not return an object; skipping merge.`);
            }
        }
        catch (error) {
            log.warn(`JsonMap transformation failed: ${error instanceof Error ? error.message : String(error)}`);
        }
    }
    return merged;
}
|
|
772
1112
|
|
|
1113
|
+
/**
|
|
1114
|
+
* @module processor/buildMetadata
|
|
1115
|
+
* Builds merged metadata from file content, inference rules, and enrichment. I/O: reads files, extracts text, loads enrichment .meta.json.
|
|
1116
|
+
*/
|
|
1117
|
+
/**
 * Produce the final metadata for a file: inferred metadata from the rules,
 * overlaid with any enrichment metadata read from the metadata directory.
 *
 * @param filePath - The file to process.
 * @param compiledRules - The compiled inference rules.
 * @param metadataDir - The metadata directory for enrichment files.
 * @param maps - Optional named JsonMap definitions.
 * @param logger - Optional logger for rule warnings.
 * @returns `inferred`, `enrichment`, the merged `metadata`, the `attributes`
 *   the rules were matched against, and the `extracted` content.
 */
async function buildMergedMetadata(filePath, compiledRules, metadataDir, maps, logger) {
    const extension = extname(filePath);
    const fileStats = await stat(filePath);
    // Text extraction also yields the frontmatter/JSON used for attributes.
    const extracted = await extractText(filePath, extension);
    const { frontmatter, json } = extracted;
    const attributes = buildAttributes(filePath, fileStats, frontmatter, json);
    const inferred = await applyRules(compiledRules, attributes, maps, logger);
    // Enrichment is spread last so its keys win over inferred ones; spreading
    // a null/undefined enrichment adds nothing.
    const enrichment = await readMetadata(filePath, metadataDir);
    const metadata = { ...inferred, ...enrichment };
    return { inferred, enrichment, metadata, attributes, extracted };
}
|
|
1143
|
+
|
|
1144
|
+
/**
|
|
1145
|
+
* @module processor/chunkIds
|
|
1146
|
+
* Generates chunk point IDs from file paths and chunk indices. Extracts chunk counts from Qdrant payloads. Pure functions, no I/O.
|
|
1147
|
+
*/
|
|
1148
|
+
/**
 * List the point IDs of every chunk of a file, for indices 0 up to (but not
 * including) `totalChunks`.
 *
 * @param filePath - The file path.
 * @param totalChunks - The total number of chunks.
 * @returns An array of point IDs, one per chunk index, in ascending order.
 */
function chunkIds(filePath, totalChunks) {
    const ids = [];
    let index = 0;
    while (index < totalChunks) {
        ids.push(pointId(filePath, index));
        index += 1;
    }
    return ids;
}
|
|
1162
|
+
/**
 * Extract the total chunk count from a payload, with a fallback.
 *
 * `total_chunks` is accepted only when it is a non-negative integer. Any other
 * value (missing, non-numeric, NaN, infinite, negative or fractional) is
 * treated as invalid and the fallback is returned instead.
 *
 * @param payload - The Qdrant point payload (or null).
 * @param fallback - The fallback value if total_chunks is missing or invalid.
 * @returns The total chunk count.
 */
function getChunkCount(payload, fallback = 1) {
    if (!payload)
        return fallback;
    const count = payload['total_chunks'];
    // A bare typeof check would let NaN, Infinity, negatives and fractions
    // through, and callers use this value as a loop bound when building IDs.
    return Number.isInteger(count) && count >= 0 ? count : fallback;
}
|
|
1175
|
+
|
|
1176
|
+
/**
|
|
1177
|
+
* @module processor/splitter
|
|
1178
|
+
* Factory for LangChain text splitters. Returns MarkdownTextSplitter or RecursiveCharacterTextSplitter based on file extension. No I/O.
|
|
1179
|
+
*/
|
|
1180
|
+
/**
 * Pick a text splitter for a file extension: Markdown files get a
 * MarkdownTextSplitter, everything else a RecursiveCharacterTextSplitter.
 *
 * @param ext - File extension (including leading dot); matched case-insensitively.
 * @param chunkSize - Maximum chunk size in characters.
 * @param chunkOverlap - Overlap between chunks in characters.
 * @returns A text splitter instance.
 */
function createSplitter(ext, chunkSize, chunkOverlap) {
    const options = { chunkSize, chunkOverlap };
    switch (ext.toLowerCase()) {
        case '.md':
        case '.markdown':
            return new MarkdownTextSplitter(options);
        default:
            return new RecursiveCharacterTextSplitter(options);
    }
}
|
|
1195
|
+
|
|
1196
|
+
/**
|
|
1197
|
+
* @module processor
|
|
1198
|
+
*
|
|
1199
|
+
* Core document processing pipeline. Handles extracting text, computing embeddings, syncing with vector store.
|
|
1200
|
+
*/
|
|
773
1201
|
/**
|
|
774
1202
|
* Core document processing pipeline.
|
|
775
1203
|
*
|
|
@@ -781,11 +1209,10 @@ class DocumentProcessor {
|
|
|
781
1209
|
vectorStore;
|
|
782
1210
|
compiledRules;
|
|
783
1211
|
logger;
|
|
784
|
-
metadataDir;
|
|
785
1212
|
/**
|
|
786
1213
|
* Create a new DocumentProcessor.
|
|
787
1214
|
*
|
|
788
|
-
* @param config - The
|
|
1215
|
+
* @param config - The processor configuration.
|
|
789
1216
|
* @param embeddingProvider - The embedding provider.
|
|
790
1217
|
* @param vectorStore - The vector store client.
|
|
791
1218
|
* @param compiledRules - The compiled inference rules.
|
|
@@ -797,7 +1224,6 @@ class DocumentProcessor {
|
|
|
797
1224
|
this.vectorStore = vectorStore;
|
|
798
1225
|
this.compiledRules = compiledRules;
|
|
799
1226
|
this.logger = logger;
|
|
800
|
-
this.metadataDir = config.metadataDir ?? '.jeeves-metadata';
|
|
801
1227
|
}
|
|
802
1228
|
/**
|
|
803
1229
|
* Process a file through the full pipeline: extract, hash, chunk, embed, upsert.
|
|
@@ -807,9 +1233,8 @@ class DocumentProcessor {
|
|
|
807
1233
|
async processFile(filePath) {
|
|
808
1234
|
try {
|
|
809
1235
|
const ext = extname(filePath);
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
const extracted = await extractText(filePath, ext);
|
|
1236
|
+
// 1. Build merged metadata + extract text
|
|
1237
|
+
const { metadata, extracted } = await buildMergedMetadata(filePath, this.compiledRules, this.config.metadataDir, this.config.maps, this.logger);
|
|
813
1238
|
if (!extracted.text.trim()) {
|
|
814
1239
|
this.logger.debug({ filePath }, 'Skipping empty file');
|
|
815
1240
|
return;
|
|
@@ -822,26 +1247,15 @@ class DocumentProcessor {
|
|
|
822
1247
|
this.logger.debug({ filePath }, 'Content unchanged, skipping');
|
|
823
1248
|
return;
|
|
824
1249
|
}
|
|
825
|
-
const oldTotalChunks =
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
const
|
|
830
|
-
const inferred = applyRules(this.compiledRules, attributes);
|
|
831
|
-
// 4. Read enrichment metadata (merge, enrichment wins)
|
|
832
|
-
const enrichment = await readMetadata(filePath, this.metadataDir);
|
|
833
|
-
const metadata = {
|
|
834
|
-
...inferred,
|
|
835
|
-
...(enrichment ?? {}),
|
|
836
|
-
};
|
|
837
|
-
// 5. Chunk text
|
|
838
|
-
const chunkSize = this.config.embedding.chunkSize ?? 1000;
|
|
839
|
-
const chunkOverlap = this.config.embedding.chunkOverlap ?? 200;
|
|
840
|
-
const splitter = this.createSplitter(ext, chunkSize, chunkOverlap);
|
|
1250
|
+
const oldTotalChunks = getChunkCount(existingPayload);
|
|
1251
|
+
// 3. Chunk text
|
|
1252
|
+
const chunkSize = this.config.chunkSize ?? 1000;
|
|
1253
|
+
const chunkOverlap = this.config.chunkOverlap ?? 200;
|
|
1254
|
+
const splitter = createSplitter(ext, chunkSize, chunkOverlap);
|
|
841
1255
|
const chunks = await splitter.splitText(extracted.text);
|
|
842
|
-
//
|
|
1256
|
+
// 4. Embed all chunks
|
|
843
1257
|
const vectors = await this.embeddingProvider.embed(chunks);
|
|
844
|
-
//
|
|
1258
|
+
// 5. Upsert all chunk points
|
|
845
1259
|
const points = chunks.map((chunk, i) => ({
|
|
846
1260
|
id: pointId(filePath, i),
|
|
847
1261
|
vector: vectors[i],
|
|
@@ -855,12 +1269,9 @@ class DocumentProcessor {
|
|
|
855
1269
|
},
|
|
856
1270
|
}));
|
|
857
1271
|
await this.vectorStore.upsert(points);
|
|
858
|
-
//
|
|
1272
|
+
// 6. Clean up orphaned chunks
|
|
859
1273
|
if (oldTotalChunks > chunks.length) {
|
|
860
|
-
const orphanIds =
|
|
861
|
-
for (let i = chunks.length; i < oldTotalChunks; i++) {
|
|
862
|
-
orphanIds.push(pointId(filePath, i));
|
|
863
|
-
}
|
|
1274
|
+
const orphanIds = chunkIds(filePath, oldTotalChunks).slice(chunks.length);
|
|
864
1275
|
await this.vectorStore.delete(orphanIds);
|
|
865
1276
|
}
|
|
866
1277
|
this.logger.info({ filePath, chunks: chunks.length }, 'File processed successfully');
|
|
@@ -879,15 +1290,10 @@ class DocumentProcessor {
|
|
|
879
1290
|
// Get the existing payload to find total chunks
|
|
880
1291
|
const baseId = pointId(filePath, 0);
|
|
881
1292
|
const existingPayload = await this.vectorStore.getPayload(baseId);
|
|
882
|
-
const totalChunks =
|
|
883
|
-
|
|
884
|
-
: 1;
|
|
885
|
-
const ids = [];
|
|
886
|
-
for (let i = 0; i < totalChunks; i++) {
|
|
887
|
-
ids.push(pointId(filePath, i));
|
|
888
|
-
}
|
|
1293
|
+
const totalChunks = getChunkCount(existingPayload);
|
|
1294
|
+
const ids = chunkIds(filePath, totalChunks);
|
|
889
1295
|
await this.vectorStore.delete(ids);
|
|
890
|
-
await deleteMetadata(filePath, this.metadataDir);
|
|
1296
|
+
await deleteMetadata(filePath, this.config.metadataDir);
|
|
891
1297
|
this.logger.info({ filePath }, 'File deleted from index');
|
|
892
1298
|
}
|
|
893
1299
|
catch (error) {
|
|
@@ -904,21 +1310,16 @@ class DocumentProcessor {
|
|
|
904
1310
|
async processMetadataUpdate(filePath, metadata) {
|
|
905
1311
|
try {
|
|
906
1312
|
// Read existing enrichment metadata and merge
|
|
907
|
-
const existing = (await readMetadata(filePath, this.metadataDir)) ?? {};
|
|
1313
|
+
const existing = (await readMetadata(filePath, this.config.metadataDir)) ?? {};
|
|
908
1314
|
const merged = { ...existing, ...metadata };
|
|
909
|
-
await writeMetadata(filePath, this.metadataDir, merged);
|
|
1315
|
+
await writeMetadata(filePath, this.config.metadataDir, merged);
|
|
910
1316
|
// Update all chunk payloads in Qdrant
|
|
911
1317
|
const baseId = pointId(filePath, 0);
|
|
912
1318
|
const existingPayload = await this.vectorStore.getPayload(baseId);
|
|
913
1319
|
if (!existingPayload)
|
|
914
1320
|
return null;
|
|
915
|
-
const totalChunks =
|
|
916
|
-
|
|
917
|
-
: 1;
|
|
918
|
-
const ids = [];
|
|
919
|
-
for (let i = 0; i < totalChunks; i++) {
|
|
920
|
-
ids.push(pointId(filePath, i));
|
|
921
|
-
}
|
|
1321
|
+
const totalChunks = getChunkCount(existingPayload);
|
|
1322
|
+
const ids = chunkIds(filePath, totalChunks);
|
|
922
1323
|
await this.vectorStore.setPayload(ids, merged);
|
|
923
1324
|
this.logger.info({ filePath, chunks: totalChunks }, 'Metadata updated');
|
|
924
1325
|
return merged;
|
|
@@ -944,27 +1345,11 @@ class DocumentProcessor {
|
|
|
944
1345
|
this.logger.debug({ filePath }, 'File not indexed, skipping');
|
|
945
1346
|
return null;
|
|
946
1347
|
}
|
|
947
|
-
|
|
948
|
-
const
|
|
949
|
-
// Extract frontmatter/json for attribute building (lightweight)
|
|
950
|
-
const extracted = await extractText(filePath, ext);
|
|
951
|
-
// Build attributes + apply current rules
|
|
952
|
-
const attributes = buildAttributes(filePath, stats, extracted.frontmatter, extracted.json);
|
|
953
|
-
const inferred = applyRules(this.compiledRules, attributes);
|
|
954
|
-
// Read enrichment metadata (merge, enrichment wins)
|
|
955
|
-
const enrichment = await readMetadata(filePath, this.metadataDir);
|
|
956
|
-
const metadata = {
|
|
957
|
-
...inferred,
|
|
958
|
-
...(enrichment ?? {}),
|
|
959
|
-
};
|
|
1348
|
+
// Build merged metadata (lightweight — no embedding)
|
|
1349
|
+
const { metadata } = await buildMergedMetadata(filePath, this.compiledRules, this.config.metadataDir, this.config.maps, this.logger);
|
|
960
1350
|
// Update all chunk payloads
|
|
961
|
-
const totalChunks =
|
|
962
|
-
|
|
963
|
-
: 1;
|
|
964
|
-
const ids = [];
|
|
965
|
-
for (let i = 0; i < totalChunks; i++) {
|
|
966
|
-
ids.push(pointId(filePath, i));
|
|
967
|
-
}
|
|
1351
|
+
const totalChunks = getChunkCount(existingPayload);
|
|
1352
|
+
const ids = chunkIds(filePath, totalChunks);
|
|
968
1353
|
await this.vectorStore.setPayload(ids, metadata);
|
|
969
1354
|
this.logger.info({ filePath, chunks: totalChunks }, 'Rules re-applied');
|
|
970
1355
|
return metadata;
|
|
@@ -983,23 +1368,12 @@ class DocumentProcessor {
|
|
|
983
1368
|
this.compiledRules = compiledRules;
|
|
984
1369
|
this.logger.info({ rules: compiledRules.length }, 'Inference rules updated');
|
|
985
1370
|
}
|
|
986
|
-
/**
|
|
987
|
-
* Create the appropriate text splitter for the given file extension.
|
|
988
|
-
*
|
|
989
|
-
* @param ext - File extension.
|
|
990
|
-
* @param chunkSize - Maximum chunk size in characters.
|
|
991
|
-
* @param chunkOverlap - Overlap between chunks in characters.
|
|
992
|
-
* @returns A text splitter instance.
|
|
993
|
-
*/
|
|
994
|
-
createSplitter(ext, chunkSize, chunkOverlap) {
|
|
995
|
-
const lowerExt = ext.toLowerCase();
|
|
996
|
-
if (lowerExt === '.md' || lowerExt === '.markdown') {
|
|
997
|
-
return new MarkdownTextSplitter({ chunkSize, chunkOverlap });
|
|
998
|
-
}
|
|
999
|
-
return new RecursiveCharacterTextSplitter({ chunkSize, chunkOverlap });
|
|
1000
|
-
}
|
|
1001
1371
|
}
|
|
1002
1372
|
|
|
1373
|
+
/**
|
|
1374
|
+
* @module queue
|
|
1375
|
+
* Debounced, rate-limited, concurrent event queue for file watchers. Manages priority queuing and async callbacks. No direct I/O; orchestrates processing.
|
|
1376
|
+
*/
|
|
1003
1377
|
/**
|
|
1004
1378
|
* A debounced, rate-limited, concurrent event queue.
|
|
1005
1379
|
*/
|
|
@@ -1148,19 +1522,23 @@ class VectorStoreClient {
|
|
|
1148
1522
|
client;
|
|
1149
1523
|
collectionName;
|
|
1150
1524
|
dims;
|
|
1525
|
+
logger;
|
|
1151
1526
|
/**
|
|
1152
1527
|
* Create a new VectorStoreClient.
|
|
1153
1528
|
*
|
|
1154
1529
|
* @param config - Vector store configuration.
|
|
1155
1530
|
* @param dimensions - The embedding vector dimensions.
|
|
1531
|
+
* @param logger - Optional pino logger for retry warnings.
|
|
1156
1532
|
*/
|
|
1157
|
-
constructor(config, dimensions) {
|
|
1533
|
+
constructor(config, dimensions, logger) {
|
|
1158
1534
|
this.client = new QdrantClient({
|
|
1159
1535
|
url: config.url,
|
|
1160
1536
|
apiKey: config.apiKey,
|
|
1537
|
+
checkCompatibility: false,
|
|
1161
1538
|
});
|
|
1162
1539
|
this.collectionName = config.collectionName;
|
|
1163
1540
|
this.dims = dimensions;
|
|
1541
|
+
this.logger = logger;
|
|
1164
1542
|
}
|
|
1165
1543
|
/**
|
|
1166
1544
|
* Ensure the collection exists with correct dimensions and Cosine distance.
|
|
@@ -1187,13 +1565,42 @@ class VectorStoreClient {
|
|
|
1187
1565
|
async upsert(points) {
|
|
1188
1566
|
if (points.length === 0)
|
|
1189
1567
|
return;
|
|
1190
|
-
await
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1568
|
+
await retry(async (attempt) => {
|
|
1569
|
+
if (attempt > 1) {
|
|
1570
|
+
const msg = {
|
|
1571
|
+
attempt,
|
|
1572
|
+
operation: 'qdrant.upsert',
|
|
1573
|
+
points: points.length,
|
|
1574
|
+
};
|
|
1575
|
+
if (this.logger) {
|
|
1576
|
+
this.logger.warn(msg, 'Retrying Qdrant upsert');
|
|
1577
|
+
}
|
|
1578
|
+
else {
|
|
1579
|
+
console.warn(msg, 'Retrying Qdrant upsert');
|
|
1580
|
+
}
|
|
1581
|
+
}
|
|
1582
|
+
await this.client.upsert(this.collectionName, {
|
|
1583
|
+
wait: true,
|
|
1584
|
+
points: points.map((p) => ({
|
|
1585
|
+
id: p.id,
|
|
1586
|
+
vector: p.vector,
|
|
1587
|
+
payload: p.payload,
|
|
1588
|
+
})),
|
|
1589
|
+
});
|
|
1590
|
+
}, {
|
|
1591
|
+
attempts: 5,
|
|
1592
|
+
baseDelayMs: 500,
|
|
1593
|
+
maxDelayMs: 10_000,
|
|
1594
|
+
jitter: 0.2,
|
|
1595
|
+
onRetry: ({ attempt, delayMs, error }) => {
|
|
1596
|
+
const msg = { attempt, delayMs, operation: 'qdrant.upsert', error };
|
|
1597
|
+
if (this.logger) {
|
|
1598
|
+
this.logger.warn(msg, 'Qdrant upsert failed; will retry');
|
|
1599
|
+
}
|
|
1600
|
+
else {
|
|
1601
|
+
console.warn(msg, 'Qdrant upsert failed; will retry');
|
|
1602
|
+
}
|
|
1603
|
+
},
|
|
1197
1604
|
});
|
|
1198
1605
|
}
|
|
1199
1606
|
/**
|
|
@@ -1204,9 +1611,38 @@ class VectorStoreClient {
|
|
|
1204
1611
|
async delete(ids) {
|
|
1205
1612
|
if (ids.length === 0)
|
|
1206
1613
|
return;
|
|
1207
|
-
await
|
|
1208
|
-
|
|
1209
|
-
|
|
1614
|
+
await retry(async (attempt) => {
|
|
1615
|
+
if (attempt > 1) {
|
|
1616
|
+
const msg = {
|
|
1617
|
+
attempt,
|
|
1618
|
+
operation: 'qdrant.delete',
|
|
1619
|
+
ids: ids.length,
|
|
1620
|
+
};
|
|
1621
|
+
if (this.logger) {
|
|
1622
|
+
this.logger.warn(msg, 'Retrying Qdrant delete');
|
|
1623
|
+
}
|
|
1624
|
+
else {
|
|
1625
|
+
console.warn(msg, 'Retrying Qdrant delete');
|
|
1626
|
+
}
|
|
1627
|
+
}
|
|
1628
|
+
await this.client.delete(this.collectionName, {
|
|
1629
|
+
wait: true,
|
|
1630
|
+
points: ids,
|
|
1631
|
+
});
|
|
1632
|
+
}, {
|
|
1633
|
+
attempts: 5,
|
|
1634
|
+
baseDelayMs: 500,
|
|
1635
|
+
maxDelayMs: 10_000,
|
|
1636
|
+
jitter: 0.2,
|
|
1637
|
+
onRetry: ({ attempt, delayMs, error }) => {
|
|
1638
|
+
const msg = { attempt, delayMs, operation: 'qdrant.delete', error };
|
|
1639
|
+
if (this.logger) {
|
|
1640
|
+
this.logger.warn(msg, 'Qdrant delete failed; will retry');
|
|
1641
|
+
}
|
|
1642
|
+
else {
|
|
1643
|
+
console.warn(msg, 'Qdrant delete failed; will retry');
|
|
1644
|
+
}
|
|
1645
|
+
},
|
|
1210
1646
|
});
|
|
1211
1647
|
}
|
|
1212
1648
|
/**
|
|
@@ -1306,6 +1742,10 @@ class VectorStoreClient {
|
|
|
1306
1742
|
}
|
|
1307
1743
|
}
|
|
1308
1744
|
|
|
1745
|
+
/**
|
|
1746
|
+
* @module watcher
|
|
1747
|
+
* Filesystem watcher wrapping chokidar. I/O: watches files/directories for add/change/unlink events, enqueues to processing queue.
|
|
1748
|
+
*/
|
|
1309
1749
|
/**
|
|
1310
1750
|
* Filesystem watcher that maps chokidar events to the processing queue.
|
|
1311
1751
|
*/
|
|
@@ -1403,16 +1843,22 @@ class JeevesWatcher {
|
|
|
1403
1843
|
this.logger = logger;
|
|
1404
1844
|
let embeddingProvider;
|
|
1405
1845
|
try {
|
|
1406
|
-
embeddingProvider = createEmbeddingProvider(this.config.embedding);
|
|
1846
|
+
embeddingProvider = createEmbeddingProvider(this.config.embedding, logger);
|
|
1407
1847
|
}
|
|
1408
1848
|
catch (error) {
|
|
1409
1849
|
logger.fatal({ error }, 'Failed to create embedding provider');
|
|
1410
1850
|
throw error;
|
|
1411
1851
|
}
|
|
1412
|
-
const vectorStore = new VectorStoreClient(this.config.vectorStore, embeddingProvider.dimensions);
|
|
1852
|
+
const vectorStore = new VectorStoreClient(this.config.vectorStore, embeddingProvider.dimensions, logger);
|
|
1413
1853
|
await vectorStore.ensureCollection();
|
|
1414
1854
|
const compiledRules = compileRules(this.config.inferenceRules ?? []);
|
|
1415
|
-
const
|
|
1855
|
+
const processorConfig = {
|
|
1856
|
+
metadataDir: this.config.metadataDir ?? '.jeeves-metadata',
|
|
1857
|
+
chunkSize: this.config.embedding.chunkSize,
|
|
1858
|
+
chunkOverlap: this.config.embedding.chunkOverlap,
|
|
1859
|
+
maps: this.config.maps,
|
|
1860
|
+
};
|
|
1861
|
+
const processor = new DocumentProcessor(processorConfig, embeddingProvider, vectorStore, compiledRules, logger);
|
|
1416
1862
|
this.processor = processor;
|
|
1417
1863
|
const queue = new EventQueue({
|
|
1418
1864
|
debounceMs: this.config.watch.debounceMs ?? 2000,
|
|
@@ -1433,7 +1879,7 @@ class JeevesWatcher {
|
|
|
1433
1879
|
this.server = server;
|
|
1434
1880
|
await server.listen({
|
|
1435
1881
|
host: this.config.api?.host ?? '127.0.0.1',
|
|
1436
|
-
port: this.config.api?.port ??
|
|
1882
|
+
port: this.config.api?.port ?? 3456,
|
|
1437
1883
|
});
|
|
1438
1884
|
watcher.start();
|
|
1439
1885
|
this.startConfigWatch();
|
|
@@ -1449,12 +1895,17 @@ class JeevesWatcher {
|
|
|
1449
1895
|
}
|
|
1450
1896
|
if (this.queue) {
|
|
1451
1897
|
const timeout = this.config.shutdownTimeoutMs ?? 10000;
|
|
1452
|
-
await Promise.race([
|
|
1453
|
-
this.queue.drain(),
|
|
1898
|
+
const drained = await Promise.race([
|
|
1899
|
+
this.queue.drain().then(() => true),
|
|
1454
1900
|
new Promise((resolve) => {
|
|
1455
|
-
setTimeout(
|
|
1901
|
+
setTimeout(() => {
|
|
1902
|
+
resolve(false);
|
|
1903
|
+
}, timeout);
|
|
1456
1904
|
}),
|
|
1457
1905
|
]);
|
|
1906
|
+
if (!drained) {
|
|
1907
|
+
this.logger?.warn({ timeoutMs: timeout }, 'Queue drain timeout hit, forcing shutdown');
|
|
1908
|
+
}
|
|
1458
1909
|
}
|
|
1459
1910
|
if (this.server) {
|
|
1460
1911
|
await this.server.close();
|
|
@@ -1503,6 +1954,7 @@ class JeevesWatcher {
|
|
|
1503
1954
|
const processor = this.processor;
|
|
1504
1955
|
if (!logger || !processor || !this.configPath)
|
|
1505
1956
|
return;
|
|
1957
|
+
logger.info({ configPath: this.configPath }, 'Config change detected, reloading...');
|
|
1506
1958
|
try {
|
|
1507
1959
|
const newConfig = await loadConfig(this.configPath);
|
|
1508
1960
|
this.config = newConfig;
|
|
@@ -1534,4 +1986,4 @@ async function startFromConfig(configPath) {
|
|
|
1534
1986
|
return app;
|
|
1535
1987
|
}
|
|
1536
1988
|
|
|
1537
|
-
export { DocumentProcessor, EventQueue, FileSystemWatcher, JeevesWatcher, VectorStoreClient, applyRules, buildAttributes, compileRules, contentHash, createApiServer, createEmbeddingProvider, createLogger, deleteMetadata, extractText, loadConfig, metadataPath, pointId, readMetadata, startFromConfig, writeMetadata };
|
|
1989
|
+
export { DocumentProcessor, EventQueue, FileSystemWatcher, JeevesWatcher, VectorStoreClient, apiConfigSchema, applyRules, buildAttributes, compileRules, configWatchConfigSchema, contentHash, createApiServer, createEmbeddingProvider, createLogger, deleteMetadata, embeddingConfigSchema, extractText, inferenceRuleSchema, jeevesWatcherConfigSchema, loadConfig, loggingConfigSchema, metadataPath, pointId, readMetadata, startFromConfig, vectorStoreConfigSchema, watchConfigSchema, writeMetadata };
|