@karmaniverous/jeeves-watcher 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -16
- package/config.schema.json +577 -0
- package/dist/cjs/index.js +1120 -517
- package/dist/cli/jeeves-watcher/index.js +1479 -695
- package/dist/index.d.ts +255 -150
- package/dist/index.iife.js +1114 -514
- package/dist/index.iife.min.js +1 -1
- package/dist/mjs/index.js +1115 -520
- package/package.json +28 -22
package/dist/index.iife.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
(function (exports, Fastify,
|
|
1
|
+
(function (exports, Fastify, promises, node_path, picomatch, radash, node_crypto, cosmiconfig, zod, jsonmap, googleGenai, pino, uuid, cheerio, yaml, mammoth, Ajv, addFormats, textsplitters, jsClientRest, chokidar) {
|
|
2
2
|
'use strict';
|
|
3
3
|
|
|
4
4
|
function _interopNamespaceDefault(e) {
|
|
@@ -20,73 +20,6 @@
|
|
|
20
20
|
|
|
21
21
|
var cheerio__namespace = /*#__PURE__*/_interopNamespaceDefault(cheerio);
|
|
22
22
|
|
|
23
|
-
/**
|
|
24
|
-
* Normalise a file path for deterministic mapping: lowercase, forward slashes, strip leading drive letter colon.
|
|
25
|
-
*
|
|
26
|
-
* @param filePath - The original file path.
|
|
27
|
-
* @returns The normalised path string.
|
|
28
|
-
*/
|
|
29
|
-
function normalisePath$1(filePath) {
|
|
30
|
-
return filePath
|
|
31
|
-
.replace(/\\/g, '/')
|
|
32
|
-
.replace(/^([A-Za-z]):/, (_m, letter) => letter.toLowerCase())
|
|
33
|
-
.toLowerCase();
|
|
34
|
-
}
|
|
35
|
-
/**
|
|
36
|
-
* Derive a deterministic `.meta.json` path for a given file.
|
|
37
|
-
*
|
|
38
|
-
* @param filePath - The watched file path.
|
|
39
|
-
* @param metadataDir - The root metadata directory.
|
|
40
|
-
* @returns The full path to the metadata file.
|
|
41
|
-
*/
|
|
42
|
-
function metadataPath(filePath, metadataDir) {
|
|
43
|
-
const normalised = normalisePath$1(filePath);
|
|
44
|
-
const hash = node_crypto.createHash('sha256').update(normalised, 'utf8').digest('hex');
|
|
45
|
-
return node_path.join(metadataDir, `${hash}.meta.json`);
|
|
46
|
-
}
|
|
47
|
-
/**
|
|
48
|
-
* Read persisted metadata for a file.
|
|
49
|
-
*
|
|
50
|
-
* @param filePath - The watched file path.
|
|
51
|
-
* @param metadataDir - The root metadata directory.
|
|
52
|
-
* @returns The parsed metadata object, or `null` if not found.
|
|
53
|
-
*/
|
|
54
|
-
async function readMetadata(filePath, metadataDir) {
|
|
55
|
-
try {
|
|
56
|
-
const raw = await promises.readFile(metadataPath(filePath, metadataDir), 'utf8');
|
|
57
|
-
return JSON.parse(raw);
|
|
58
|
-
}
|
|
59
|
-
catch {
|
|
60
|
-
return null;
|
|
61
|
-
}
|
|
62
|
-
}
|
|
63
|
-
/**
|
|
64
|
-
* Write metadata for a file.
|
|
65
|
-
*
|
|
66
|
-
* @param filePath - The watched file path.
|
|
67
|
-
* @param metadataDir - The root metadata directory.
|
|
68
|
-
* @param metadata - The metadata to persist.
|
|
69
|
-
*/
|
|
70
|
-
async function writeMetadata(filePath, metadataDir, metadata) {
|
|
71
|
-
const dest = metadataPath(filePath, metadataDir);
|
|
72
|
-
await promises.mkdir(node_path.dirname(dest), { recursive: true });
|
|
73
|
-
await promises.writeFile(dest, JSON.stringify(metadata, null, 2), 'utf8');
|
|
74
|
-
}
|
|
75
|
-
/**
|
|
76
|
-
* Delete metadata for a file.
|
|
77
|
-
*
|
|
78
|
-
* @param filePath - The watched file path.
|
|
79
|
-
* @param metadataDir - The root metadata directory.
|
|
80
|
-
*/
|
|
81
|
-
async function deleteMetadata(filePath, metadataDir) {
|
|
82
|
-
try {
|
|
83
|
-
await promises.rm(metadataPath(filePath, metadataDir));
|
|
84
|
-
}
|
|
85
|
-
catch {
|
|
86
|
-
// Ignore if file doesn't exist.
|
|
87
|
-
}
|
|
88
|
-
}
|
|
89
|
-
|
|
90
23
|
/**
|
|
91
24
|
* Best-effort base directory inference for a glob pattern.
|
|
92
25
|
*
|
|
@@ -160,233 +93,536 @@
|
|
|
160
93
|
}
|
|
161
94
|
|
|
162
95
|
/**
|
|
163
|
-
*
|
|
96
|
+
* @module processAllFiles
|
|
164
97
|
*
|
|
165
|
-
*
|
|
98
|
+
* Shared helper for processing all files matching configured globs.
|
|
99
|
+
*/
|
|
100
|
+
/**
|
|
101
|
+
* Process all files from globs using the specified processor method.
|
|
166
102
|
*
|
|
167
|
-
* @param
|
|
168
|
-
* @
|
|
103
|
+
* @param watchPaths - The glob patterns to match.
|
|
104
|
+
* @param ignoredPaths - The glob patterns to ignore.
|
|
105
|
+
* @param processor - The document processor instance.
|
|
106
|
+
* @param method - The processor method to call ('processFile' or 'processRulesUpdate').
|
|
107
|
+
* @returns The number of files processed.
|
|
169
108
|
*/
|
|
170
|
-
function
|
|
171
|
-
const
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
}
|
|
177
|
-
|
|
109
|
+
async function processAllFiles(watchPaths, ignoredPaths, processor, method) {
|
|
110
|
+
const files = await listFilesFromGlobs(watchPaths, ignoredPaths);
|
|
111
|
+
for (const file of files) {
|
|
112
|
+
// Sequential on purpose to avoid surprising load.
|
|
113
|
+
// Queue integration can come later.
|
|
114
|
+
await processor[method](file);
|
|
115
|
+
}
|
|
116
|
+
return files.length;
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
/**
|
|
120
|
+
* @module api/handlers/configReindex
|
|
121
|
+
* Fastify route handler for POST /config-reindex. Triggers an async reindex job scoped to rules or full processing.
|
|
122
|
+
*/
|
|
123
|
+
/**
|
|
124
|
+
* Create handler for POST /config-reindex.
|
|
125
|
+
*
|
|
126
|
+
* @param deps - Route dependencies.
|
|
127
|
+
*/
|
|
128
|
+
function createConfigReindexHandler(deps) {
|
|
129
|
+
return async (request, reply) => {
|
|
178
130
|
try {
|
|
179
|
-
const
|
|
180
|
-
|
|
181
|
-
|
|
131
|
+
const scope = request.body.scope ?? 'rules';
|
|
132
|
+
// Return immediately and run async
|
|
133
|
+
void (async () => {
|
|
134
|
+
try {
|
|
135
|
+
if (scope === 'rules') {
|
|
136
|
+
const count = await processAllFiles(deps.config.watch.paths, deps.config.watch.ignored, deps.processor, 'processRulesUpdate');
|
|
137
|
+
deps.logger.info({ scope, filesProcessed: count }, 'Config reindex (rules) completed');
|
|
138
|
+
}
|
|
139
|
+
else {
|
|
140
|
+
const count = await processAllFiles(deps.config.watch.paths, deps.config.watch.ignored, deps.processor, 'processFile');
|
|
141
|
+
deps.logger.info({ scope, filesProcessed: count }, 'Config reindex (full) completed');
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
catch (error) {
|
|
145
|
+
deps.logger.error({ error, scope }, 'Config reindex failed');
|
|
146
|
+
}
|
|
147
|
+
})();
|
|
148
|
+
return await reply.status(200).send({ status: 'started', scope });
|
|
182
149
|
}
|
|
183
150
|
catch (error) {
|
|
184
|
-
logger.error({ error }, '
|
|
185
|
-
return reply.status(500).send({ error: 'Internal server error' });
|
|
151
|
+
deps.logger.error({ error }, 'Config reindex request failed');
|
|
152
|
+
return await reply.status(500).send({ error: 'Internal server error' });
|
|
186
153
|
}
|
|
187
|
-
}
|
|
188
|
-
|
|
154
|
+
};
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
/**
|
|
158
|
+
* @module api/handlers/metadata
|
|
159
|
+
* Fastify route handler for POST /metadata. Performs enrichment metadata updates via the document processor.
|
|
160
|
+
*/
|
|
161
|
+
/**
|
|
162
|
+
* Create handler for POST /metadata.
|
|
163
|
+
*
|
|
164
|
+
* @param deps - Route dependencies.
|
|
165
|
+
*/
|
|
166
|
+
function createMetadataHandler(deps) {
|
|
167
|
+
return async (request, reply) => {
|
|
189
168
|
try {
|
|
190
|
-
const {
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
return results;
|
|
169
|
+
const { path, metadata } = request.body;
|
|
170
|
+
await deps.processor.processMetadataUpdate(path, metadata);
|
|
171
|
+
return { ok: true };
|
|
194
172
|
}
|
|
195
173
|
catch (error) {
|
|
196
|
-
logger.error({ error }, '
|
|
174
|
+
deps.logger.error({ error }, 'Metadata update failed');
|
|
197
175
|
return reply.status(500).send({ error: 'Internal server error' });
|
|
198
176
|
}
|
|
199
|
-
}
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
177
|
+
};
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
/**
|
|
181
|
+
* @module util/normalizePath
|
|
182
|
+
* Normalizes file paths for deterministic mapping: lowercase, forward slashes, optional drive letter stripping.
|
|
183
|
+
*/
|
|
184
|
+
/**
|
|
185
|
+
* Normalize a file path: lowercase, forward slashes, optionally strip drive letter colon.
|
|
186
|
+
*
|
|
187
|
+
* @param filePath - The original file path.
|
|
188
|
+
* @param stripDriveLetter - Whether to strip the colon from a leading drive letter (e.g. `C:` → `c`).
|
|
189
|
+
* @returns The normalized path string.
|
|
190
|
+
*/
|
|
191
|
+
function normalizePath(filePath, stripDriveLetter = false) {
|
|
192
|
+
let result = filePath.replace(/\\/g, '/').toLowerCase();
|
|
193
|
+
if (stripDriveLetter) {
|
|
194
|
+
result = result.replace(/^([a-z]):/, (_m, letter) => letter);
|
|
195
|
+
}
|
|
196
|
+
return result;
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
/**
|
|
200
|
+
* @module metadata/metadata
|
|
201
|
+
* Persists file metadata as .meta.json. I/O: reads/writes/deletes metadata files under metadataDir. Path mapping via SHA-256 hash.
|
|
202
|
+
*/
|
|
203
|
+
/**
|
|
204
|
+
* Derive a deterministic `.meta.json` path for a given file.
|
|
205
|
+
*
|
|
206
|
+
* @param filePath - The watched file path.
|
|
207
|
+
* @param metadataDir - The root metadata directory.
|
|
208
|
+
* @returns The full path to the metadata file.
|
|
209
|
+
*/
|
|
210
|
+
function metadataPath(filePath, metadataDir) {
|
|
211
|
+
const normalised = normalizePath(filePath, true);
|
|
212
|
+
const hash = node_crypto.createHash('sha256').update(normalised, 'utf8').digest('hex');
|
|
213
|
+
return node_path.join(metadataDir, `${hash}.meta.json`);
|
|
214
|
+
}
|
|
215
|
+
/**
|
|
216
|
+
* Read persisted metadata for a file.
|
|
217
|
+
*
|
|
218
|
+
* @param filePath - The watched file path.
|
|
219
|
+
* @param metadataDir - The root metadata directory.
|
|
220
|
+
* @returns The parsed metadata object, or `null` if not found.
|
|
221
|
+
*/
|
|
222
|
+
async function readMetadata(filePath, metadataDir) {
|
|
223
|
+
try {
|
|
224
|
+
const raw = await promises.readFile(metadataPath(filePath, metadataDir), 'utf8');
|
|
225
|
+
return JSON.parse(raw);
|
|
226
|
+
}
|
|
227
|
+
catch {
|
|
228
|
+
return null;
|
|
229
|
+
}
|
|
230
|
+
}
|
|
231
|
+
/**
|
|
232
|
+
* Write metadata for a file.
|
|
233
|
+
*
|
|
234
|
+
* @param filePath - The watched file path.
|
|
235
|
+
* @param metadataDir - The root metadata directory.
|
|
236
|
+
* @param metadata - The metadata to persist.
|
|
237
|
+
*/
|
|
238
|
+
async function writeMetadata(filePath, metadataDir, metadata) {
|
|
239
|
+
const dest = metadataPath(filePath, metadataDir);
|
|
240
|
+
await promises.mkdir(node_path.dirname(dest), { recursive: true });
|
|
241
|
+
await promises.writeFile(dest, JSON.stringify(metadata, null, 2), 'utf8');
|
|
242
|
+
}
|
|
243
|
+
/**
|
|
244
|
+
* Delete metadata for a file.
|
|
245
|
+
*
|
|
246
|
+
* @param filePath - The watched file path.
|
|
247
|
+
* @param metadataDir - The root metadata directory.
|
|
248
|
+
*/
|
|
249
|
+
async function deleteMetadata(filePath, metadataDir) {
|
|
250
|
+
try {
|
|
251
|
+
await promises.rm(metadataPath(filePath, metadataDir));
|
|
252
|
+
}
|
|
253
|
+
catch {
|
|
254
|
+
// Ignore if file doesn't exist.
|
|
255
|
+
}
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
/**
|
|
259
|
+
* @module metadata/constants
|
|
260
|
+
* Shared constants for metadata key classification. System keys are injected by the indexing pipeline, not user-provided.
|
|
261
|
+
*/
|
|
262
|
+
/** Keys managed by the indexing pipeline (not user enrichment). */
|
|
263
|
+
const SYSTEM_METADATA_KEYS = [
|
|
264
|
+
'file_path',
|
|
265
|
+
'chunk_index',
|
|
266
|
+
'total_chunks',
|
|
267
|
+
'content_hash',
|
|
268
|
+
'chunk_text',
|
|
269
|
+
];
|
|
270
|
+
|
|
271
|
+
/**
|
|
272
|
+
* @module api/handlers/rebuildMetadata
|
|
273
|
+
* Fastify route handler for POST /rebuild-metadata. Recreates enrichment metadata files from vector store payloads.
|
|
274
|
+
*/
|
|
275
|
+
/**
|
|
276
|
+
* Create handler for POST /rebuild-metadata.
|
|
277
|
+
*
|
|
278
|
+
* @param deps - Route dependencies.
|
|
279
|
+
*/
|
|
280
|
+
function createRebuildMetadataHandler(deps) {
|
|
281
|
+
return async (_request, reply) => {
|
|
218
282
|
try {
|
|
219
|
-
const metadataDir =
|
|
220
|
-
|
|
283
|
+
const metadataDir = deps.config.metadataDir ?? '.jeeves-metadata';
|
|
284
|
+
const systemKeys = [...SYSTEM_METADATA_KEYS];
|
|
285
|
+
for await (const point of deps.vectorStore.scroll()) {
|
|
221
286
|
const payload = point.payload;
|
|
222
287
|
const filePath = payload['file_path'];
|
|
223
288
|
if (typeof filePath !== 'string' || filePath.length === 0)
|
|
224
289
|
continue;
|
|
225
290
|
// Persist only enrichment-ish fields, not chunking/index fields.
|
|
226
|
-
const
|
|
227
|
-
|
|
228
|
-
delete rest.chunk_index;
|
|
229
|
-
delete rest.total_chunks;
|
|
230
|
-
delete rest.content_hash;
|
|
231
|
-
delete rest.chunk_text;
|
|
232
|
-
await writeMetadata(filePath, metadataDir, rest);
|
|
291
|
+
const enrichment = radash.omit(payload, systemKeys);
|
|
292
|
+
await writeMetadata(filePath, metadataDir, enrichment);
|
|
233
293
|
}
|
|
234
294
|
return await reply.status(200).send({ ok: true });
|
|
235
295
|
}
|
|
236
296
|
catch (error) {
|
|
237
|
-
logger.error({ error }, 'Rebuild metadata failed');
|
|
297
|
+
deps.logger.error({ error }, 'Rebuild metadata failed');
|
|
238
298
|
return await reply.status(500).send({ error: 'Internal server error' });
|
|
239
299
|
}
|
|
240
|
-
}
|
|
241
|
-
|
|
300
|
+
};
|
|
301
|
+
}
|
|
302
|
+
|
|
303
|
+
/**
|
|
304
|
+
* @module api/handlers/reindex
|
|
305
|
+
* Fastify route handler for POST /reindex. Reprocesses all watched files through the processor.
|
|
306
|
+
*/
|
|
307
|
+
/**
|
|
308
|
+
* Create handler for POST /reindex.
|
|
309
|
+
*
|
|
310
|
+
* @param deps - Route dependencies.
|
|
311
|
+
*/
|
|
312
|
+
function createReindexHandler(deps) {
|
|
313
|
+
return async (_request, reply) => {
|
|
242
314
|
try {
|
|
243
|
-
const
|
|
244
|
-
|
|
245
|
-
void (async () => {
|
|
246
|
-
try {
|
|
247
|
-
if (scope === 'rules') {
|
|
248
|
-
// Re-apply inference rules to all files, update Qdrant payloads (no re-embedding)
|
|
249
|
-
const files = await listFilesFromGlobs(options.config.watch.paths, options.config.watch.ignored);
|
|
250
|
-
for (const file of files) {
|
|
251
|
-
// Use the new processRulesUpdate method
|
|
252
|
-
await processor.processRulesUpdate(file);
|
|
253
|
-
}
|
|
254
|
-
logger.info({ scope, filesProcessed: files.length }, 'Config reindex (rules) completed');
|
|
255
|
-
}
|
|
256
|
-
else {
|
|
257
|
-
// Full reindex: re-extract, re-embed, re-upsert
|
|
258
|
-
const files = await listFilesFromGlobs(options.config.watch.paths, options.config.watch.ignored);
|
|
259
|
-
for (const file of files) {
|
|
260
|
-
await processor.processFile(file);
|
|
261
|
-
}
|
|
262
|
-
logger.info({ scope, filesProcessed: files.length }, 'Config reindex (full) completed');
|
|
263
|
-
}
|
|
264
|
-
}
|
|
265
|
-
catch (error) {
|
|
266
|
-
logger.error({ error, scope }, 'Config reindex failed');
|
|
267
|
-
}
|
|
268
|
-
})();
|
|
269
|
-
return await reply.status(200).send({ status: 'started', scope });
|
|
315
|
+
const count = await processAllFiles(deps.config.watch.paths, deps.config.watch.ignored, deps.processor, 'processFile');
|
|
316
|
+
return await reply.status(200).send({ ok: true, filesIndexed: count });
|
|
270
317
|
}
|
|
271
318
|
catch (error) {
|
|
272
|
-
logger.error({ error }, '
|
|
319
|
+
deps.logger.error({ error }, 'Reindex failed');
|
|
273
320
|
return await reply.status(500).send({ error: 'Internal server error' });
|
|
274
321
|
}
|
|
322
|
+
};
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
/**
|
|
326
|
+
* @module api/handlers/search
|
|
327
|
+
* Fastify route handler for POST /search. Embeds a query and performs vector store similarity search.
|
|
328
|
+
*/
|
|
329
|
+
/**
|
|
330
|
+
* Create handler for POST /search.
|
|
331
|
+
*
|
|
332
|
+
* @param deps - Route dependencies.
|
|
333
|
+
*/
|
|
334
|
+
function createSearchHandler(deps) {
|
|
335
|
+
return async (request, reply) => {
|
|
336
|
+
try {
|
|
337
|
+
const { query, limit = 10 } = request.body;
|
|
338
|
+
const vectors = await deps.embeddingProvider.embed([query]);
|
|
339
|
+
const results = await deps.vectorStore.search(vectors[0], limit);
|
|
340
|
+
return results;
|
|
341
|
+
}
|
|
342
|
+
catch (error) {
|
|
343
|
+
deps.logger.error({ error }, 'Search failed');
|
|
344
|
+
return reply.status(500).send({ error: 'Internal server error' });
|
|
345
|
+
}
|
|
346
|
+
};
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
/**
|
|
350
|
+
* @module api/handlers/status
|
|
351
|
+
* Fastify route handler for GET /status. Pure handler: returns process uptime and health.
|
|
352
|
+
*/
|
|
353
|
+
/**
|
|
354
|
+
* Create handler for GET /status.
|
|
355
|
+
*/
|
|
356
|
+
function createStatusHandler() {
|
|
357
|
+
return () => ({
|
|
358
|
+
status: 'ok',
|
|
359
|
+
uptime: process.uptime(),
|
|
275
360
|
});
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
/**
|
|
364
|
+
* Create the Fastify API server with all routes registered.
|
|
365
|
+
*
|
|
366
|
+
* The returned instance is not yet listening — call `server.listen()` to start.
|
|
367
|
+
*
|
|
368
|
+
* @param options - The server options.
|
|
369
|
+
* @returns A configured Fastify instance.
|
|
370
|
+
*/
|
|
371
|
+
function createApiServer(options) {
|
|
372
|
+
const { processor, vectorStore, embeddingProvider, logger, config } = options;
|
|
373
|
+
const app = Fastify({ logger: false });
|
|
374
|
+
app.get('/status', createStatusHandler());
|
|
375
|
+
app.post('/metadata', createMetadataHandler({ processor, logger }));
|
|
376
|
+
app.post('/search', createSearchHandler({ embeddingProvider, vectorStore, logger }));
|
|
377
|
+
app.post('/reindex', createReindexHandler({ config, processor, logger }));
|
|
378
|
+
app.post('/rebuild-metadata', createRebuildMetadataHandler({ config, vectorStore, logger }));
|
|
379
|
+
app.post('/config-reindex', createConfigReindexHandler({ config, processor, logger }));
|
|
276
380
|
return app;
|
|
277
381
|
}
|
|
278
382
|
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
watch: {
|
|
286
|
-
type: 'object',
|
|
287
|
-
required: ['paths'],
|
|
288
|
-
properties: {
|
|
289
|
-
paths: { type: 'array', items: { type: 'string' }, minItems: 1 },
|
|
290
|
-
ignored: { type: 'array', items: { type: 'string' } },
|
|
291
|
-
pollIntervalMs: { type: 'number' },
|
|
292
|
-
usePolling: { type: 'boolean' },
|
|
293
|
-
debounceMs: { type: 'number' },
|
|
294
|
-
stabilityThresholdMs: { type: 'number' },
|
|
295
|
-
},
|
|
296
|
-
additionalProperties: false,
|
|
297
|
-
},
|
|
298
|
-
configWatch: {
|
|
299
|
-
type: 'object',
|
|
300
|
-
properties: {
|
|
301
|
-
enabled: { type: 'boolean' },
|
|
302
|
-
debounceMs: { type: 'number' },
|
|
303
|
-
},
|
|
304
|
-
additionalProperties: false,
|
|
305
|
-
},
|
|
306
|
-
embedding: {
|
|
307
|
-
type: 'object',
|
|
308
|
-
required: ['provider', 'model'],
|
|
309
|
-
properties: {
|
|
310
|
-
provider: { type: 'string' },
|
|
311
|
-
model: { type: 'string' },
|
|
312
|
-
chunkSize: { type: 'number' },
|
|
313
|
-
chunkOverlap: { type: 'number' },
|
|
314
|
-
dimensions: { type: 'number' },
|
|
315
|
-
apiKey: { type: 'string' },
|
|
316
|
-
rateLimitPerMinute: { type: 'number' },
|
|
317
|
-
concurrency: { type: 'number' },
|
|
318
|
-
},
|
|
319
|
-
additionalProperties: false,
|
|
320
|
-
},
|
|
321
|
-
vectorStore: {
|
|
322
|
-
type: 'object',
|
|
323
|
-
required: ['url', 'collectionName'],
|
|
324
|
-
properties: {
|
|
325
|
-
url: { type: 'string' },
|
|
326
|
-
collectionName: { type: 'string' },
|
|
327
|
-
apiKey: { type: 'string' },
|
|
328
|
-
},
|
|
329
|
-
additionalProperties: false,
|
|
330
|
-
},
|
|
331
|
-
metadataDir: { type: 'string' },
|
|
332
|
-
api: {
|
|
333
|
-
type: 'object',
|
|
334
|
-
properties: {
|
|
335
|
-
host: { type: 'string' },
|
|
336
|
-
port: { type: 'number' },
|
|
337
|
-
},
|
|
338
|
-
additionalProperties: false,
|
|
339
|
-
},
|
|
340
|
-
extractors: { type: 'object' },
|
|
341
|
-
inferenceRules: {
|
|
342
|
-
type: 'array',
|
|
343
|
-
items: {
|
|
344
|
-
type: 'object',
|
|
345
|
-
required: ['match', 'set'],
|
|
346
|
-
properties: {
|
|
347
|
-
match: { type: 'object' },
|
|
348
|
-
set: { type: 'object' },
|
|
349
|
-
},
|
|
350
|
-
additionalProperties: false,
|
|
351
|
-
},
|
|
352
|
-
},
|
|
353
|
-
logging: {
|
|
354
|
-
type: 'object',
|
|
355
|
-
properties: {
|
|
356
|
-
level: { type: 'string' },
|
|
357
|
-
file: { type: 'string' },
|
|
358
|
-
},
|
|
359
|
-
additionalProperties: false,
|
|
360
|
-
},
|
|
361
|
-
shutdownTimeoutMs: { type: 'number' },
|
|
362
|
-
},
|
|
363
|
-
additionalProperties: false,
|
|
364
|
-
};
|
|
365
|
-
const ajv = new Ajv({ allErrors: true });
|
|
366
|
-
const validate = ajv.compile(configSchema);
|
|
367
|
-
/** Default values for optional configuration fields. */
|
|
368
|
-
const DEFAULTS = {
|
|
369
|
-
configWatch: { enabled: true, debounceMs: 1000 },
|
|
383
|
+
/**
|
|
384
|
+
* @module config/defaults
|
|
385
|
+
* Default configuration values for jeeves-watcher. Pure data export, no I/O or side effects.
|
|
386
|
+
*/
|
|
387
|
+
/** Default root-level config values. */
|
|
388
|
+
const ROOT_DEFAULTS = {
|
|
370
389
|
metadataDir: '.jeeves-watcher',
|
|
371
|
-
api: { host: '127.0.0.1', port: 3100 },
|
|
372
|
-
logging: { level: 'info' },
|
|
373
390
|
shutdownTimeoutMs: 10000,
|
|
374
391
|
};
|
|
375
|
-
/** Default values
|
|
392
|
+
/** Default configWatch values. */
|
|
393
|
+
const CONFIG_WATCH_DEFAULTS = {
|
|
394
|
+
enabled: true,
|
|
395
|
+
debounceMs: 1000,
|
|
396
|
+
};
|
|
397
|
+
/** Default API values. */
|
|
398
|
+
const API_DEFAULTS = {
|
|
399
|
+
host: '127.0.0.1',
|
|
400
|
+
port: 3456,
|
|
401
|
+
};
|
|
402
|
+
/** Default logging values. */
|
|
403
|
+
const LOGGING_DEFAULTS = {
|
|
404
|
+
level: 'info',
|
|
405
|
+
};
|
|
406
|
+
/** Default watch configuration. */
|
|
376
407
|
const WATCH_DEFAULTS = {
|
|
377
408
|
debounceMs: 300,
|
|
378
409
|
stabilityThresholdMs: 500,
|
|
379
410
|
usePolling: false,
|
|
380
411
|
pollIntervalMs: 1000,
|
|
381
412
|
};
|
|
382
|
-
/** Default
|
|
413
|
+
/** Default embedding configuration. */
|
|
383
414
|
const EMBEDDING_DEFAULTS = {
|
|
384
415
|
chunkSize: 1000,
|
|
385
416
|
chunkOverlap: 200,
|
|
386
|
-
dimensions:
|
|
417
|
+
dimensions: 3072,
|
|
387
418
|
rateLimitPerMinute: 300,
|
|
388
419
|
concurrency: 5,
|
|
389
420
|
};
|
|
421
|
+
|
|
422
|
+
/**
|
|
423
|
+
* Watch configuration for file system monitoring.
|
|
424
|
+
*/
|
|
425
|
+
const watchConfigSchema = zod.z.object({
|
|
426
|
+
/** Glob patterns to watch. */
|
|
427
|
+
paths: zod.z
|
|
428
|
+
.array(zod.z.string())
|
|
429
|
+
.min(1)
|
|
430
|
+
.describe('Glob patterns for files to watch (e.g., "**/*.md"). At least one required.'),
|
|
431
|
+
/** Glob patterns to ignore. */
|
|
432
|
+
ignored: zod.z
|
|
433
|
+
.array(zod.z.string())
|
|
434
|
+
.optional()
|
|
435
|
+
.describe('Glob patterns to exclude from watching (e.g., "**/node_modules/**").'),
|
|
436
|
+
/** Polling interval in milliseconds. */
|
|
437
|
+
pollIntervalMs: zod.z
|
|
438
|
+
.number()
|
|
439
|
+
.optional()
|
|
440
|
+
.describe('Polling interval in milliseconds when usePolling is enabled.'),
|
|
441
|
+
/** Whether to use polling instead of native watchers. */
|
|
442
|
+
usePolling: zod.z
|
|
443
|
+
.boolean()
|
|
444
|
+
.optional()
|
|
445
|
+
.describe('Use polling instead of native file system events (for network drives).'),
|
|
446
|
+
/** Debounce delay in milliseconds for file change events. */
|
|
447
|
+
debounceMs: zod.z
|
|
448
|
+
.number()
|
|
449
|
+
.optional()
|
|
450
|
+
.describe('Debounce delay in milliseconds for file change events.'),
|
|
451
|
+
/** Time in milliseconds a file must be stable before processing. */
|
|
452
|
+
stabilityThresholdMs: zod.z
|
|
453
|
+
.number()
|
|
454
|
+
.optional()
|
|
455
|
+
.describe('Time in milliseconds a file must remain unchanged before processing.'),
|
|
456
|
+
});
|
|
457
|
+
/**
|
|
458
|
+
* Configuration watch settings.
|
|
459
|
+
*/
|
|
460
|
+
const configWatchConfigSchema = zod.z.object({
|
|
461
|
+
/** Whether config file watching is enabled. */
|
|
462
|
+
enabled: zod.z
|
|
463
|
+
.boolean()
|
|
464
|
+
.optional()
|
|
465
|
+
.describe('Enable automatic reloading when config file changes.'),
|
|
466
|
+
/** Debounce delay in milliseconds for config change events. */
|
|
467
|
+
debounceMs: zod.z
|
|
468
|
+
.number()
|
|
469
|
+
.optional()
|
|
470
|
+
.describe('Debounce delay in milliseconds for config file change detection.'),
|
|
471
|
+
});
|
|
472
|
+
/**
|
|
473
|
+
* Embedding model configuration.
|
|
474
|
+
*/
|
|
475
|
+
const embeddingConfigSchema = zod.z.object({
|
|
476
|
+
/** The embedding model provider. */
|
|
477
|
+
provider: zod.z
|
|
478
|
+
.string()
|
|
479
|
+
.default('gemini')
|
|
480
|
+
.describe('Embedding provider name (e.g., "gemini", "openai").'),
|
|
481
|
+
/** The embedding model name. */
|
|
482
|
+
model: zod.z
|
|
483
|
+
.string()
|
|
484
|
+
.default('gemini-embedding-001')
|
|
485
|
+
.describe('Embedding model identifier (e.g., "gemini-embedding-001", "text-embedding-3-small").'),
|
|
486
|
+
/** Maximum tokens per chunk for splitting. */
|
|
487
|
+
chunkSize: zod.z
|
|
488
|
+
.number()
|
|
489
|
+
.optional()
|
|
490
|
+
.describe('Maximum chunk size in characters for text splitting.'),
|
|
491
|
+
/** Overlap between chunks in tokens. */
|
|
492
|
+
chunkOverlap: zod.z
|
|
493
|
+
.number()
|
|
494
|
+
.optional()
|
|
495
|
+
.describe('Character overlap between consecutive chunks.'),
|
|
496
|
+
/** Embedding vector dimensions. */
|
|
497
|
+
dimensions: zod.z
|
|
498
|
+
.number()
|
|
499
|
+
.optional()
|
|
500
|
+
.describe('Embedding vector dimensions (must match model output).'),
|
|
501
|
+
/** API key for the embedding provider. */
|
|
502
|
+
apiKey: zod.z
|
|
503
|
+
.string()
|
|
504
|
+
.optional()
|
|
505
|
+
.describe('API key for embedding provider (supports ${ENV_VAR} substitution).'),
|
|
506
|
+
/** Maximum embedding requests per minute. */
|
|
507
|
+
rateLimitPerMinute: zod.z
|
|
508
|
+
.number()
|
|
509
|
+
.optional()
|
|
510
|
+
.describe('Maximum embedding API requests per minute (rate limiting).'),
|
|
511
|
+
/** Maximum concurrent embedding requests. */
|
|
512
|
+
concurrency: zod.z
|
|
513
|
+
.number()
|
|
514
|
+
.optional()
|
|
515
|
+
.describe('Maximum concurrent embedding requests.'),
|
|
516
|
+
});
|
|
517
|
+
/**
|
|
518
|
+
* Vector store configuration for Qdrant.
|
|
519
|
+
*/
|
|
520
|
+
const vectorStoreConfigSchema = zod.z.object({
|
|
521
|
+
/** Qdrant server URL. */
|
|
522
|
+
url: zod.z
|
|
523
|
+
.string()
|
|
524
|
+
.describe('Qdrant server URL (e.g., "http://localhost:6333").'),
|
|
525
|
+
/** Qdrant collection name. */
|
|
526
|
+
collectionName: zod.z
|
|
527
|
+
.string()
|
|
528
|
+
.describe('Qdrant collection name for vector storage.'),
|
|
529
|
+
/** Qdrant API key. */
|
|
530
|
+
apiKey: zod.z
|
|
531
|
+
.string()
|
|
532
|
+
.optional()
|
|
533
|
+
.describe('Qdrant API key for authentication (supports ${ENV_VAR} substitution).'),
|
|
534
|
+
});
|
|
535
|
+
/**
|
|
536
|
+
* API server configuration.
|
|
537
|
+
*/
|
|
538
|
+
const apiConfigSchema = zod.z.object({
|
|
539
|
+
/** Host to bind to. */
|
|
540
|
+
host: zod.z
|
|
541
|
+
.string()
|
|
542
|
+
.optional()
|
|
543
|
+
.describe('Host address for API server (e.g., "127.0.0.1", "0.0.0.0").'),
|
|
544
|
+
/** Port to listen on. */
|
|
545
|
+
port: zod.z.number().optional().describe('Port for API server (e.g., 3456).'),
|
|
546
|
+
});
|
|
547
|
+
/**
|
|
548
|
+
* Logging configuration.
|
|
549
|
+
*/
|
|
550
|
+
const loggingConfigSchema = zod.z.object({
|
|
551
|
+
/** Log level. */
|
|
552
|
+
level: zod.z
|
|
553
|
+
.string()
|
|
554
|
+
.optional()
|
|
555
|
+
.describe('Logging level (trace, debug, info, warn, error, fatal).'),
|
|
556
|
+
/** Log file path. */
|
|
557
|
+
file: zod.z
|
|
558
|
+
.string()
|
|
559
|
+
.optional()
|
|
560
|
+
.describe('Path to log file (logs to stdout if omitted).'),
|
|
561
|
+
});
|
|
562
|
+
/**
|
|
563
|
+
* An inference rule that enriches document metadata.
|
|
564
|
+
*/
|
|
565
|
+
const inferenceRuleSchema = zod.z.object({
|
|
566
|
+
/** JSON Schema object to match against document metadata. */
|
|
567
|
+
match: zod.z
|
|
568
|
+
.record(zod.z.string(), zod.z.unknown())
|
|
569
|
+
.describe('JSON Schema object to match against file attributes.'),
|
|
570
|
+
/** Metadata fields to set when the rule matches. */
|
|
571
|
+
set: zod.z
|
|
572
|
+
.record(zod.z.string(), zod.z.unknown())
|
|
573
|
+
.describe('Metadata fields to set when match succeeds.'),
|
|
574
|
+
/** JsonMap transformation (inline or reference to named map). */
|
|
575
|
+
map: zod.z
|
|
576
|
+
.union([jsonmap.jsonMapMapSchema, zod.z.string()])
|
|
577
|
+
.optional()
|
|
578
|
+
.describe('JsonMap transformation (inline definition or named map reference).'),
|
|
579
|
+
});
|
|
580
|
+
/**
|
|
581
|
+
* Top-level configuration for jeeves-watcher.
|
|
582
|
+
*/
|
|
583
|
+
const jeevesWatcherConfigSchema = zod.z.object({
|
|
584
|
+
/** File system watch configuration. */
|
|
585
|
+
watch: watchConfigSchema.describe('File system watch configuration.'),
|
|
586
|
+
/** Configuration file watch settings. */
|
|
587
|
+
configWatch: configWatchConfigSchema
|
|
588
|
+
.optional()
|
|
589
|
+
.describe('Configuration file watch settings.'),
|
|
590
|
+
/** Embedding model configuration. */
|
|
591
|
+
embedding: embeddingConfigSchema.describe('Embedding model configuration.'),
|
|
592
|
+
/** Vector store configuration. */
|
|
593
|
+
vectorStore: vectorStoreConfigSchema.describe('Qdrant vector store configuration.'),
|
|
594
|
+
/** Directory for persisted metadata. */
|
|
595
|
+
metadataDir: zod.z
|
|
596
|
+
.string()
|
|
597
|
+
.optional()
|
|
598
|
+
.describe('Directory for persisted metadata sidecar files.'),
|
|
599
|
+
/** API server configuration. */
|
|
600
|
+
api: apiConfigSchema.optional().describe('API server configuration.'),
|
|
601
|
+
/** Extractor configurations keyed by name. */
|
|
602
|
+
extractors: zod.z
|
|
603
|
+
.record(zod.z.string(), zod.z.unknown())
|
|
604
|
+
.optional()
|
|
605
|
+
.describe('Extractor configurations keyed by name.'),
|
|
606
|
+
/** Rules for inferring metadata from document properties. */
|
|
607
|
+
inferenceRules: zod.z
|
|
608
|
+
.array(inferenceRuleSchema)
|
|
609
|
+
.optional()
|
|
610
|
+
.describe('Rules for inferring metadata from file attributes.'),
|
|
611
|
+
/** Reusable named JsonMap transformations. */
|
|
612
|
+
maps: zod.z
|
|
613
|
+
.record(zod.z.string(), jsonmap.jsonMapMapSchema)
|
|
614
|
+
.optional()
|
|
615
|
+
.describe('Reusable named JsonMap transformations.'),
|
|
616
|
+
/** Logging configuration. */
|
|
617
|
+
logging: loggingConfigSchema.optional().describe('Logging configuration.'),
|
|
618
|
+
/** Timeout in milliseconds for graceful shutdown. */
|
|
619
|
+
shutdownTimeoutMs: zod.z
|
|
620
|
+
.number()
|
|
621
|
+
.optional()
|
|
622
|
+
.describe('Timeout in milliseconds for graceful shutdown.'),
|
|
623
|
+
});
|
|
624
|
+
|
|
625
|
+
const MODULE_NAME = 'jeeves-watcher';
|
|
390
626
|
/**
|
|
391
627
|
* Merge sensible defaults into a loaded configuration.
|
|
392
628
|
*
|
|
@@ -395,13 +631,13 @@
|
|
|
395
631
|
*/
|
|
396
632
|
/**
 * Merge sensible defaults into a loaded configuration.
 *
 * Nested sections (watch, configWatch, embedding, api, logging) are merged
 * shallowly so user-provided keys override the corresponding defaults.
 *
 * @param raw - The validated raw configuration.
 * @returns The configuration with all defaults applied.
 */
function applyDefaults(raw) {
    const merged = {
        ...ROOT_DEFAULTS,
        ...raw,
    };
    // Re-merge each nested section so defaults survive a partial user object.
    merged.watch = { ...WATCH_DEFAULTS, ...raw.watch };
    merged.configWatch = { ...CONFIG_WATCH_DEFAULTS, ...raw.configWatch };
    merged.embedding = { ...EMBEDDING_DEFAULTS, ...raw.embedding };
    merged.api = { ...API_DEFAULTS, ...raw.api };
    merged.logging = { ...LOGGING_DEFAULTS, ...raw.logging };
    return merged;
}
|
|
407
643
|
/**
|
|
@@ -419,21 +655,114 @@
|
|
|
419
655
|
if (!result || result.isEmpty) {
|
|
420
656
|
throw new Error('No jeeves-watcher configuration found. Create a .jeeves-watcherrc or jeeves-watcher.config.{js,ts,json,yaml} file.');
|
|
421
657
|
}
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
658
|
+
try {
|
|
659
|
+
const validated = jeevesWatcherConfigSchema.parse(result.config);
|
|
660
|
+
return applyDefaults(validated);
|
|
661
|
+
}
|
|
662
|
+
catch (error) {
|
|
663
|
+
if (error instanceof zod.ZodError) {
|
|
664
|
+
const errors = error.issues
|
|
665
|
+
.map((issue) => `${issue.path.join('.')}: ${issue.message}`)
|
|
666
|
+
.join('; ');
|
|
667
|
+
throw new Error(`Invalid jeeves-watcher configuration: ${errors}`);
|
|
668
|
+
}
|
|
669
|
+
throw error;
|
|
433
670
|
}
|
|
434
|
-
return applyDefaults(raw);
|
|
435
671
|
}
|
|
436
672
|
|
|
673
|
+
/**
|
|
674
|
+
* @module util/logger
|
|
675
|
+
* Logger fallback helper. Provides a unified warn interface that delegates to pino or console.
|
|
676
|
+
*/
|
|
677
|
+
/**
|
|
678
|
+
* Return a minimal logger that delegates to pino if available, otherwise console.
|
|
679
|
+
*
|
|
680
|
+
* @param logger - Optional pino logger instance.
|
|
681
|
+
* @returns A minimal logger.
|
|
682
|
+
*/
|
|
683
|
+
/**
 * Return a minimal logger that delegates to pino if available, otherwise console.
 *
 * @param logger - Optional pino logger instance.
 * @returns The given logger, or a console-backed fallback exposing `warn`.
 */
function getLogger(logger) {
    if (logger) return logger;
    // Fallback: mirror pino's warn(obj, msg) shape onto console.warn.
    const warn = (obj, msg) => {
        if (msg) console.warn(obj, msg);
        else console.warn(obj);
    };
    return { warn };
}
|
|
697
|
+
|
|
698
|
+
/**
|
|
699
|
+
* @module util/retry
|
|
700
|
+
* Small async retry helper with exponential backoff. Side effects: sleeps between attempts; can invoke onRetry callback for logging.
|
|
701
|
+
*/
|
|
702
|
+
/**
 * Sleep for the given number of milliseconds, optionally abortable.
 *
 * Resolves immediately for non-positive durations. If an AbortSignal is
 * supplied and fires (or is already aborted), the promise rejects with
 * an "aborted" error and the timer is cleared.
 *
 * @param ms - Milliseconds to sleep.
 * @param signal - Optional AbortSignal to cancel the sleep.
 * @returns A promise that resolves after the delay or rejects on abort.
 */
function sleep(ms, signal) {
    if (ms <= 0)
        return Promise.resolve();
    return new Promise((resolve, reject) => {
        const handleAbort = () => {
            teardown();
            reject(new Error('Retry sleep aborted'));
        };
        const timer = setTimeout(() => {
            teardown();
            resolve();
        }, ms);
        // Clear timer and detach listener so neither side fires twice.
        const teardown = () => {
            clearTimeout(timer);
            if (signal)
                signal.removeEventListener('abort', handleAbort);
        };
        if (!signal)
            return;
        if (signal.aborted) {
            handleAbort();
        } else {
            signal.addEventListener('abort', handleAbort, { once: true });
        }
    });
}
|
|
728
|
+
/**
 * Compute an exponential-backoff delay for a retry attempt.
 *
 * The delay doubles per attempt (attempt 1 => base), is capped at
 * maxDelayMs, and is optionally stretched by a random jitter factor
 * in [1, 1 + jitter).
 *
 * @param attempt - 1-based attempt number.
 * @param baseDelayMs - Base delay in milliseconds.
 * @param maxDelayMs - Upper bound on the delay.
 * @param jitter - Optional jitter fraction (0 disables jitter).
 * @returns The rounded delay in milliseconds.
 */
function computeDelayMs(attempt, baseDelayMs, maxDelayMs, jitter = 0) {
    const exponent = Math.max(0, attempt - 1);
    let delay = baseDelayMs * 2 ** exponent;
    if (delay > maxDelayMs)
        delay = maxDelayMs;
    if (jitter > 0)
        delay *= 1 + Math.random() * jitter;
    return Math.round(delay);
}
|
|
734
|
+
/**
|
|
735
|
+
* Retry an async operation using exponential backoff.
|
|
736
|
+
*
|
|
737
|
+
* @param fn - Operation to execute.
|
|
738
|
+
* @param options - Retry policy.
|
|
739
|
+
* @returns The operation result.
|
|
740
|
+
*/
|
|
741
|
+
/**
 * Retry an async operation using exponential backoff.
 *
 * Invokes `fn(attempt)` (1-based) up to `options.attempts` times (at least
 * once). Between failed attempts it sleeps for a backoff delay and invokes
 * the optional `onRetry` callback; the final failure is rethrown.
 *
 * @param fn - Operation to execute; receives the attempt number.
 * @param options - Retry policy (attempts, baseDelayMs, maxDelayMs, jitter, onRetry, signal).
 * @returns The operation result.
 * @throws The last error when all attempts fail.
 */
async function retry(fn, options) {
    const maxAttempts = Math.max(1, options.attempts);
    let finalError;
    let attempt = 0;
    while (attempt < maxAttempts) {
        attempt += 1;
        try {
            return await fn(attempt);
        }
        catch (error) {
            finalError = error;
            if (attempt >= maxAttempts)
                break;
            const delayMs = computeDelayMs(attempt, options.baseDelayMs, options.maxDelayMs, options.jitter);
            options.onRetry?.({ attempt, attempts: maxAttempts, delayMs, error });
            await sleep(delayMs, options.signal);
        }
    }
    throw finalError;
}
|
|
760
|
+
|
|
761
|
+
/**
|
|
762
|
+
* @module embedding
|
|
763
|
+
*
|
|
764
|
+
* Embedding provider abstractions and registry-backed factory.
|
|
765
|
+
*/
|
|
437
766
|
/**
|
|
438
767
|
* Create a mock embedding provider that generates deterministic vectors from content hashes.
|
|
439
768
|
*
|
|
@@ -461,14 +790,16 @@
|
|
|
461
790
|
* Create a Gemini embedding provider using the Google Generative AI SDK.
|
|
462
791
|
*
|
|
463
792
|
* @param config - The embedding configuration.
|
|
793
|
+
* @param logger - Optional pino logger for retry warnings.
|
|
464
794
|
* @returns A Gemini {@link EmbeddingProvider}.
|
|
465
795
|
* @throws If the API key is missing.
|
|
466
796
|
*/
|
|
467
|
-
function createGeminiProvider(config) {
|
|
797
|
+
function createGeminiProvider(config, logger) {
|
|
468
798
|
if (!config.apiKey) {
|
|
469
799
|
throw new Error('Gemini embedding provider requires config.embedding.apiKey');
|
|
470
800
|
}
|
|
471
801
|
const dimensions = config.dimensions ?? 3072;
|
|
802
|
+
const log = getLogger(logger);
|
|
472
803
|
const embedder = new googleGenai.GoogleGenerativeAIEmbeddings({
|
|
473
804
|
apiKey: config.apiKey,
|
|
474
805
|
model: config.model,
|
|
@@ -476,8 +807,27 @@
|
|
|
476
807
|
return {
|
|
477
808
|
dimensions,
|
|
478
809
|
async embed(texts) {
|
|
479
|
-
|
|
480
|
-
|
|
810
|
+
const vectors = await retry(async (attempt) => {
|
|
811
|
+
if (attempt > 1) {
|
|
812
|
+
log.warn({ attempt, provider: 'gemini', model: config.model }, 'Retrying embedding request');
|
|
813
|
+
}
|
|
814
|
+
// embedDocuments returns vectors for multiple texts
|
|
815
|
+
return embedder.embedDocuments(texts);
|
|
816
|
+
}, {
|
|
817
|
+
attempts: 5,
|
|
818
|
+
baseDelayMs: 500,
|
|
819
|
+
maxDelayMs: 10_000,
|
|
820
|
+
jitter: 0.2,
|
|
821
|
+
onRetry: ({ attempt, delayMs, error }) => {
|
|
822
|
+
log.warn({
|
|
823
|
+
attempt,
|
|
824
|
+
delayMs,
|
|
825
|
+
provider: 'gemini',
|
|
826
|
+
model: config.model,
|
|
827
|
+
error,
|
|
828
|
+
}, 'Embedding call failed; will retry');
|
|
829
|
+
},
|
|
830
|
+
});
|
|
481
831
|
// Validate dimensions
|
|
482
832
|
for (const vector of vectors) {
|
|
483
833
|
if (vector.length !== dimensions) {
|
|
@@ -488,25 +838,36 @@
|
|
|
488
838
|
},
|
|
489
839
|
};
|
|
490
840
|
}
|
|
841
|
+
/**
 * Create a mock embedding provider from an embedding configuration.
 *
 * @param config - The embedding configuration; `dimensions` defaults to 768.
 * @returns A mock {@link EmbeddingProvider}.
 */
function createMockFromConfig(config) {
    return createMockProvider(config.dimensions ?? 768);
}
|
|
845
|
+
/** Registry mapping embedding provider names to factory functions. */
const embeddingProviderRegistry = new Map(Object.entries({
    mock: createMockFromConfig,
    gemini: createGeminiProvider,
}));
|
|
491
849
|
/**
|
|
492
850
|
* Create an embedding provider based on the given configuration.
|
|
493
851
|
*
|
|
852
|
+
* Each provider is responsible for its own default dimensions.
|
|
853
|
+
*
|
|
494
854
|
* @param config - The embedding configuration.
|
|
855
|
+
* @param logger - Optional pino logger for retry warnings.
|
|
495
856
|
* @returns An {@link EmbeddingProvider} instance.
|
|
496
857
|
* @throws If the configured provider is not supported.
|
|
497
858
|
*/
|
|
498
|
-
function createEmbeddingProvider(config) {
|
|
499
|
-
const
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
return createMockProvider(dimensions);
|
|
503
|
-
case 'gemini':
|
|
504
|
-
return createGeminiProvider(config);
|
|
505
|
-
default:
|
|
506
|
-
throw new Error(`Unsupported embedding provider: ${config.provider}`);
|
|
859
|
+
/**
 * Create an embedding provider based on the given configuration.
 *
 * Each provider is responsible for its own default dimensions.
 *
 * @param config - The embedding configuration.
 * @param logger - Optional pino logger for retry warnings.
 * @returns An {@link EmbeddingProvider} instance.
 * @throws If the configured provider is not registered.
 */
function createEmbeddingProvider(config, logger) {
    const build = embeddingProviderRegistry.get(config.provider);
    if (build === undefined) {
        throw new Error(`Unsupported embedding provider: ${config.provider}`);
    }
    return build(config, logger);
}
|
|
509
866
|
|
|
867
|
+
/**
|
|
868
|
+
* @module logger
|
|
869
|
+
* Creates pino logger instances. I/O: optionally writes logs to file via pino/file transport. Defaults to stdout at info level.
|
|
870
|
+
*/
|
|
510
871
|
/**
|
|
511
872
|
* Create a pino logger instance.
|
|
512
873
|
*
|
|
@@ -525,6 +886,45 @@
|
|
|
525
886
|
return pino({ level });
|
|
526
887
|
}
|
|
527
888
|
|
|
889
|
+
/**
|
|
890
|
+
* @module hash
|
|
891
|
+
* Provides SHA-256 content hashing. Pure function: given text string, returns hex digest. No I/O or side effects.
|
|
892
|
+
*/
|
|
893
|
+
/**
|
|
894
|
+
* Compute a SHA-256 hex digest of the given text.
|
|
895
|
+
*
|
|
896
|
+
* @param text - The input text to hash.
|
|
897
|
+
* @returns The hex-encoded SHA-256 hash.
|
|
898
|
+
*/
|
|
899
|
+
/**
 * Compute a SHA-256 hex digest of the given text.
 *
 * Pure function: no I/O or side effects.
 *
 * @param text - The input text to hash (treated as UTF-8).
 * @returns The hex-encoded SHA-256 hash.
 */
function contentHash(text) {
    const hasher = node_crypto.createHash('sha256');
    hasher.update(text, 'utf8');
    return hasher.digest('hex');
}
|
|
902
|
+
|
|
903
|
+
/**
|
|
904
|
+
* @module pointId
|
|
905
|
+
* Generates deterministic UUIDv5 point IDs for file paths and chunk indices. Pure function: normalizes paths, returns stable IDs. No I/O.
|
|
906
|
+
*/
|
|
907
|
+
/** Namespace UUID for jeeves-watcher point IDs. */
|
|
908
|
+
/** Namespace UUID for jeeves-watcher point IDs. */
const NAMESPACE = '6a6f686e-6761-4c74-ad6a-656576657321';
/**
 * Generate a deterministic UUID v5 point ID for a file (and optional chunk index).
 *
 * The path is normalized first so equivalent paths map to the same ID.
 *
 * @param filePath - The file path.
 * @param chunkIndex - Optional chunk index within the file.
 * @returns A deterministic UUID v5 string.
 */
function pointId(filePath, chunkIndex) {
    const base = normalizePath(filePath);
    if (chunkIndex === undefined) {
        return uuid.v5(base, NAMESPACE);
    }
    return uuid.v5(`${base}#${String(chunkIndex)}`, NAMESPACE);
}
|
|
922
|
+
|
|
923
|
+
/**
|
|
924
|
+
* @module extractors
|
|
925
|
+
*
|
|
926
|
+
* Text extraction registry for supported file formats.
|
|
927
|
+
*/
|
|
528
928
|
/**
|
|
529
929
|
* Extract YAML frontmatter from a Markdown document.
|
|
530
930
|
*
|
|
@@ -570,6 +970,55 @@
|
|
|
570
970
|
}
|
|
571
971
|
return JSON.stringify(obj);
|
|
572
972
|
}
|
|
973
|
+
/**
 * Extract text and YAML frontmatter from a Markdown file.
 *
 * @param filePath - Path to the Markdown file.
 * @returns The body text and any parsed frontmatter.
 */
async function extractMarkdown(filePath) {
    const source = await promises.readFile(filePath, 'utf8');
    const { frontmatter, body } = extractMarkdownFrontmatter(source);
    return { frontmatter, text: body };
}
|
|
978
|
+
/**
 * Read a file as UTF-8 plaintext.
 *
 * @param filePath - Path to the text file.
 * @returns The file contents as `text`.
 */
async function extractPlaintext(filePath) {
    const text = await promises.readFile(filePath, 'utf8');
    return { text };
}
|
|
982
|
+
/**
 * Extract text and structured data from a JSON file.
 *
 * Only a plain-object root is exposed as `json`; arrays and primitives
 * contribute text only.
 *
 * @param filePath - Path to the JSON file.
 * @returns Flattened text plus the parsed object (when the root is an object).
 * @throws SyntaxError if the file is not valid JSON.
 */
async function extractJson(filePath) {
    const raw = await promises.readFile(filePath, 'utf8');
    const parsed = JSON.parse(raw);
    const isPlainObject =
        parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
    return { text: extractJsonText(parsed), json: isPlainObject ? parsed : undefined };
}
|
|
990
|
+
/**
 * Extract text from a PDF file using the lazily-imported `unpdf` package.
 *
 * @param filePath - Path to the PDF file.
 * @returns The extracted text (pages joined with blank lines).
 */
async function extractPdf(filePath) {
    // Lazy import keeps unpdf out of the startup path.
    const { extractText: extractPdfText } = await import('unpdf');
    const bytes = new Uint8Array(await promises.readFile(filePath));
    const { text } = await extractPdfText(bytes);
    // unpdf may return one string per page; join pages with blank lines.
    return { text: Array.isArray(text) ? text.join('\n\n') : text };
}
|
|
999
|
+
/**
 * Extract raw text from a DOCX file via mammoth.
 *
 * @param filePath - Path to the DOCX file.
 * @returns The extracted raw text.
 */
async function extractDocx(filePath) {
    const buffer = await promises.readFile(filePath);
    const { value } = await mammoth.extractRawText({ buffer });
    return { text: value };
}
|
|
1004
|
+
/**
 * Extract visible text from an HTML file using cheerio.
 *
 * Script and style elements are stripped; if the body yields no text the
 * whole document's text is used as a fallback.
 *
 * @param filePath - Path to the HTML file.
 * @returns The extracted text.
 */
async function extractHtml(filePath) {
    const markup = await promises.readFile(filePath, 'utf8');
    const $ = cheerio__namespace.load(markup);
    $('script, style').remove();
    const bodyText = $('body').text().trim();
    return { text: bodyText !== '' ? bodyText : $.text().trim() };
}
|
|
1011
|
+
/** Registry mapping lowercase file extensions to extractor functions. */
const extractorRegistry = new Map(Object.entries({
    '.md': extractMarkdown,
    '.markdown': extractMarkdown,
    '.txt': extractPlaintext,
    '.text': extractPlaintext,
    '.json': extractJson,
    '.pdf': extractPdf,
    '.docx': extractDocx,
    '.html': extractHtml,
    '.htm': extractHtml,
}));
|
|
573
1022
|
/**
|
|
574
1023
|
* Extract text from a file based on extension.
|
|
575
1024
|
*
|
|
@@ -578,87 +1027,132 @@
|
|
|
578
1027
|
* @returns Extracted text and optional structured data.
|
|
579
1028
|
*/
|
|
580
1029
|
/**
 * Extract text from a file based on its extension.
 *
 * Dispatches through {@link extractorRegistry}; unknown extensions are
 * treated as plaintext.
 *
 * @param filePath - The file to read.
 * @param extension - File extension including the leading dot.
 * @returns Extracted text and optional structured data.
 */
async function extractText(filePath, extension) {
    // Default: treat as plaintext.
    const extractor = extractorRegistry.get(extension.toLowerCase()) ?? extractPlaintext;
    return extractor(filePath);
}
|
|
626
1036
|
|
|
627
1037
|
/**
|
|
628
|
-
*
|
|
1038
|
+
* @module rules/templates
|
|
1039
|
+
* Resolves template variables (`${path.to.value}`) in rule `set` objects against file attributes.
|
|
1040
|
+
*/
|
|
1041
|
+
/**
|
|
1042
|
+
* Resolve `${template.vars}` in a value against the given attributes.
|
|
629
1043
|
*
|
|
630
|
-
* @param
|
|
631
|
-
* @
|
|
1044
|
+
* @param value - The value to resolve.
|
|
1045
|
+
* @param attributes - The file attributes for variable lookup.
|
|
1046
|
+
* @returns The resolved value.
|
|
632
1047
|
*/
|
|
633
|
-
function
|
|
634
|
-
|
|
1048
|
+
/**
 * Resolve `${template.vars}` in a value against the given attributes.
 *
 * Non-string values pass through untouched. Missing (null/undefined)
 * lookups resolve to the empty string; non-string values are JSON-encoded.
 *
 * @param value - The value to resolve.
 * @param attributes - The file attributes for variable lookup.
 * @returns The resolved value.
 */
function resolveTemplateVars(value, attributes) {
    if (typeof value !== 'string') {
        return value;
    }
    const substitute = (_whole, varPath) => {
        const resolved = radash.get(attributes, varPath);
        if (resolved === undefined || resolved === null) {
            return '';
        }
        return typeof resolved === 'string' ? resolved : JSON.stringify(resolved);
    };
    return value.replace(/\$\{([^}]+)\}/g, substitute);
}
|
|
1058
|
+
/**
|
|
1059
|
+
* Resolve all template variables in a `set` object.
|
|
1060
|
+
*
|
|
1061
|
+
* @param setObj - The key-value pairs to resolve.
|
|
1062
|
+
* @param attributes - The file attributes for variable lookup.
|
|
1063
|
+
* @returns The resolved key-value pairs.
|
|
1064
|
+
*/
|
|
1065
|
+
/**
 * Resolve all template variables in a `set` object.
 *
 * @param setObj - The key-value pairs to resolve.
 * @param attributes - The file attributes for variable lookup.
 * @returns The resolved key-value pairs.
 */
function resolveSet(setObj, attributes) {
    return Object.fromEntries(
        Object.entries(setObj).map(([key, value]) => [
            key,
            resolveTemplateVars(value, attributes),
        ])
    );
}
|
|
636
1072
|
|
|
637
|
-
/** Namespace UUID for jeeves-watcher point IDs. */
|
|
638
|
-
const NAMESPACE = '6a6f686e-6761-4c74-ad6a-656576657321';
|
|
639
1073
|
/**
|
|
640
|
-
*
|
|
1074
|
+
* @module rules/apply
|
|
1075
|
+
* Applies compiled inference rules to file attributes, producing merged metadata via template resolution and JsonMap transforms.
|
|
1076
|
+
*/
|
|
1077
|
+
/**
|
|
1078
|
+
* Create the lib object for JsonMap transformations.
|
|
641
1079
|
*
|
|
642
|
-
* @
|
|
643
|
-
* @returns The normalised path string.
|
|
1080
|
+
* @returns The lib object.
|
|
644
1081
|
*/
|
|
645
|
-
function
|
|
646
|
-
return
|
|
1082
|
+
/**
 * Create the lib object of helper functions for JsonMap transformations.
 *
 * @returns The lib object (string/array helpers plus a radash-backed `get`).
 */
function createJsonMapLib() {
    const lib = {};
    lib.split = (str, separator) => str.split(separator);
    lib.slice = (arr, start, end) => arr.slice(start, end);
    lib.join = (arr, separator) => arr.join(separator);
    lib.toLowerCase = (str) => str.toLowerCase();
    lib.replace = (str, search, replacement) => str.replace(search, replacement);
    lib.get = (obj, path) => radash.get(obj, path);
    return lib;
}
|
|
648
1092
|
/**
|
|
649
|
-
*
|
|
1093
|
+
* Apply compiled inference rules to file attributes, returning merged metadata.
|
|
650
1094
|
*
|
|
651
|
-
*
|
|
652
|
-
*
|
|
653
|
-
*
|
|
1095
|
+
* Rules are evaluated in order; later rules override earlier ones.
|
|
1096
|
+
* If a rule has a `map`, the JsonMap transformation is applied after `set` resolution,
|
|
1097
|
+
* and map output overrides set output on conflict.
|
|
1098
|
+
*
|
|
1099
|
+
* @param compiledRules - The compiled rules to evaluate.
|
|
1100
|
+
* @param attributes - The file attributes to match against.
|
|
1101
|
+
* @param namedMaps - Optional record of named JsonMap definitions.
|
|
1102
|
+
* @param logger - Optional logger for warnings (falls back to console.warn).
|
|
1103
|
+
* @returns The merged metadata from all matching rules.
|
|
654
1104
|
*/
|
|
655
|
-
function
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
1105
|
+
/**
 * Apply compiled inference rules to file attributes, returning merged metadata.
 *
 * Rules are evaluated in order; later rules override earlier ones. If a rule
 * has a `map`, the JsonMap transformation runs after `set` resolution and its
 * output overrides set output on conflict. Failed or non-object map results
 * are logged and skipped.
 *
 * @param compiledRules - The compiled rules to evaluate.
 * @param attributes - The file attributes to match against.
 * @param namedMaps - Optional record of named JsonMap definitions.
 * @param logger - Optional logger for warnings (falls back to console.warn).
 * @returns The merged metadata from all matching rules.
 */
async function applyRules(compiledRules, attributes, namedMaps, logger) {
    // JsonMap's type definitions expect a generic JsonMapLib shape with unary
    // functions; our helpers accept multiple args, which JsonMap supports at runtime.
    const lib = createJsonMapLib();
    const log = logger ?? console;
    let merged = {};
    for (const { rule, validate } of compiledRules) {
        if (!validate(attributes))
            continue;
        // Apply set resolution first.
        merged = { ...merged, ...resolveSet(rule.set, attributes) };
        if (!rule.map)
            continue;
        // Resolve map reference (named lookup vs inline definition).
        let mapDef = rule.map;
        if (typeof rule.map === 'string') {
            mapDef = namedMaps?.[rule.map];
            if (!mapDef) {
                log.warn(`Map reference "${rule.map}" not found in named maps. Skipping map transformation.`);
                continue;
            }
        }
        // Execute JsonMap transformation; only plain-object results are merged.
        try {
            const mapOutput = await new jsonmap.JsonMap(mapDef, lib).transform(attributes);
            if (mapOutput && typeof mapOutput === 'object' && !Array.isArray(mapOutput)) {
                merged = { ...merged, ...mapOutput };
            }
            else {
                log.warn(`JsonMap transformation did not return an object; skipping merge.`);
            }
        }
        catch (error) {
            log.warn(`JsonMap transformation failed: ${error instanceof Error ? error.message : String(error)}`);
        }
    }
    return merged;
}
|
|
661
1151
|
|
|
1152
|
+
/**
|
|
1153
|
+
* @module rules/attributes
|
|
1154
|
+
* Builds file attribute objects for rule matching. Pure function: derives attributes from path, stats, and extracted data.
|
|
1155
|
+
*/
|
|
662
1156
|
/**
|
|
663
1157
|
* Build {@link FileAttributes} from a file path and stat info.
|
|
664
1158
|
*
|
|
@@ -686,10 +1180,15 @@
|
|
|
686
1180
|
attrs.json = extractedJson;
|
|
687
1181
|
return attrs;
|
|
688
1182
|
}
|
|
1183
|
+
|
|
1184
|
+
/**
|
|
1185
|
+
* @module rules/ajvSetup
|
|
1186
|
+
* AJV instance factory with custom glob keyword for picomatch-based pattern matching in rule schemas.
|
|
1187
|
+
*/
|
|
689
1188
|
/**
|
|
690
|
-
* Create an
|
|
1189
|
+
* Create an AJV instance with a custom `glob` format for picomatch glob matching.
|
|
691
1190
|
*
|
|
692
|
-
* @returns The configured
|
|
1191
|
+
* @returns The configured AJV instance.
|
|
693
1192
|
*/
|
|
694
1193
|
function createRuleAjv() {
|
|
695
1194
|
const ajv = new Ajv({ allErrors: true });
|
|
@@ -702,6 +1201,11 @@
|
|
|
702
1201
|
});
|
|
703
1202
|
return ajv;
|
|
704
1203
|
}
|
|
1204
|
+
|
|
1205
|
+
/**
|
|
1206
|
+
* @module rules/compile
|
|
1207
|
+
* Compiles inference rule definitions into executable AJV validators for efficient rule evaluation.
|
|
1208
|
+
*/
|
|
705
1209
|
/**
|
|
706
1210
|
* Compile an array of inference rules into executable validators.
|
|
707
1211
|
*
|
|
@@ -718,62 +1222,95 @@
|
|
|
718
1222
|
}),
|
|
719
1223
|
}));
|
|
720
1224
|
}
|
|
1225
|
+
|
|
1226
|
+
/**
|
|
1227
|
+
* @module processor/buildMetadata
|
|
1228
|
+
* Builds merged metadata from file content, inference rules, and enrichment. I/O: reads files, extracts text, loads enrichment .meta.json.
|
|
1229
|
+
*/
|
|
721
1230
|
/**
|
|
722
|
-
*
|
|
1231
|
+
* Build merged metadata for a file by applying inference rules and merging with enrichment metadata.
|
|
723
1232
|
*
|
|
724
|
-
* @param
|
|
725
|
-
* @param
|
|
726
|
-
* @
|
|
1233
|
+
* @param filePath - The file to process.
|
|
1234
|
+
* @param compiledRules - The compiled inference rules.
|
|
1235
|
+
* @param metadataDir - The metadata directory for enrichment files.
|
|
1236
|
+
* @param maps - Optional named JsonMap definitions.
|
|
1237
|
+
* @param logger - Optional logger for rule warnings.
|
|
1238
|
+
* @returns The merged metadata and intermediate data.
|
|
727
1239
|
*/
|
|
728
|
-
function
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
}
|
|
1240
|
+
/**
 * Build merged metadata for a file by applying inference rules and merging
 * with enrichment metadata.
 *
 * @param filePath - The file to process.
 * @param compiledRules - The compiled inference rules.
 * @param metadataDir - The metadata directory for enrichment files.
 * @param maps - Optional named JsonMap definitions.
 * @param logger - Optional logger for rule warnings.
 * @returns The merged metadata plus intermediate data (inferred, enrichment, attributes, extracted).
 */
async function buildMergedMetadata(filePath, compiledRules, metadataDir, maps, logger) {
    const stats = await promises.stat(filePath);
    // 1. Extract text and structured data.
    const extracted = await extractText(filePath, node_path.extname(filePath));
    // 2. Build attributes and apply inference rules.
    const attributes = buildAttributes(filePath, stats, extracted.frontmatter, extracted.json);
    const inferred = await applyRules(compiledRules, attributes, maps, logger);
    // 3. Merge with enrichment metadata (enrichment wins on conflict).
    const enrichment = await readMetadata(filePath, metadataDir);
    const metadata = { ...inferred, ...(enrichment ?? {}) };
    return { inferred, enrichment, metadata, attributes, extracted };
}
|
|
1256
|
+
|
|
744
1257
|
/**
|
|
745
|
-
*
|
|
1258
|
+
* @module processor/chunkIds
|
|
1259
|
+
* Generates chunk point IDs from file paths and chunk indices. Extracts chunk counts from Qdrant payloads. Pure functions, no I/O.
|
|
1260
|
+
*/
|
|
1261
|
+
/**
|
|
1262
|
+
* Generate an array of chunk IDs for a file.
|
|
746
1263
|
*
|
|
747
|
-
* @param
|
|
748
|
-
* @param
|
|
749
|
-
* @returns
|
|
1264
|
+
* @param filePath - The file path.
|
|
1265
|
+
* @param totalChunks - The total number of chunks.
|
|
1266
|
+
* @returns An array of point IDs for each chunk.
|
|
750
1267
|
*/
|
|
751
|
-
function
|
|
752
|
-
const
|
|
753
|
-
for (
|
|
754
|
-
|
|
1268
|
+
/**
 * Generate the deterministic point ID for every chunk of a file.
 *
 * @param filePath - The file path.
 * @param totalChunks - The total number of chunks.
 * @returns An array of point IDs, one per chunk index.
 */
function chunkIds(filePath, totalChunks) {
    return Array.from({ length: totalChunks }, (_, index) => pointId(filePath, index));
}
|
|
758
1275
|
/**
|
|
759
|
-
*
|
|
1276
|
+
* Extract the total chunk count from a payload, with a fallback.
|
|
760
1277
|
*
|
|
761
|
-
*
|
|
1278
|
+
* @param payload - The Qdrant point payload (or null).
|
|
1279
|
+
* @param fallback - The fallback value if total_chunks is missing or invalid.
|
|
1280
|
+
* @returns The total chunk count.
|
|
1281
|
+
*/
|
|
1282
|
+
/**
 * Extract the total chunk count from a Qdrant point payload, with a fallback.
 *
 * @param payload - The Qdrant point payload (or null/undefined).
 * @param fallback - Value returned when `total_chunks` is missing or invalid.
 * @returns The total chunk count.
 */
function getChunkCount(payload, fallback = 1) {
    if (!payload)
        return fallback;
    const count = payload['total_chunks'];
    // Reject NaN, Infinity, negative and fractional values: callers use the
    // result as a loop bound (e.g. `for (let i = 0; i < totalChunks; i++)`),
    // so a bad payload value must fall back rather than propagate.
    return typeof count === 'number' && Number.isInteger(count) && count >= 0
        ? count
        : fallback;
}
|
|
1288
|
+
|
|
1289
|
+
/**
|
|
1290
|
+
* @module processor/splitter
|
|
1291
|
+
* Factory for LangChain text splitters. Returns MarkdownTextSplitter or RecursiveCharacterTextSplitter based on file extension. No I/O.
|
|
1292
|
+
*/
|
|
1293
|
+
/**
|
|
1294
|
+
* Create the appropriate text splitter for the given file extension.
|
|
762
1295
|
*
|
|
763
|
-
* @param
|
|
764
|
-
* @param
|
|
765
|
-
* @
|
|
1296
|
+
* @param ext - File extension (including leading dot).
|
|
1297
|
+
* @param chunkSize - Maximum chunk size in characters.
|
|
1298
|
+
* @param chunkOverlap - Overlap between chunks in characters.
|
|
1299
|
+
* @returns A text splitter instance.
|
|
766
1300
|
*/
|
|
767
|
-
function
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
merged = { ...merged, ...resolveSet(rule.set, attributes) };
|
|
772
|
-
}
|
|
1301
|
+
/**
 * Create the appropriate text splitter for the given file extension.
 *
 * Markdown files get a Markdown-aware splitter; everything else uses the
 * recursive character splitter.
 *
 * @param ext - File extension (including leading dot).
 * @param chunkSize - Maximum chunk size in characters.
 * @param chunkOverlap - Overlap between chunks in characters.
 * @returns A text splitter instance.
 */
function createSplitter(ext, chunkSize, chunkOverlap) {
    const options = { chunkSize, chunkOverlap };
    switch (ext.toLowerCase()) {
        case '.md':
        case '.markdown':
            return new textsplitters.MarkdownTextSplitter(options);
        default:
            return new textsplitters.RecursiveCharacterTextSplitter(options);
    }
}
|
|
776
1308
|
|
|
1309
|
+
/**
|
|
1310
|
+
* @module processor
|
|
1311
|
+
*
|
|
1312
|
+
* Core document processing pipeline. Handles extracting text, computing embeddings, syncing with vector store.
|
|
1313
|
+
*/
|
|
777
1314
|
/**
|
|
778
1315
|
* Core document processing pipeline.
|
|
779
1316
|
*
|
|
@@ -785,11 +1322,10 @@
|
|
|
785
1322
|
vectorStore;
|
|
786
1323
|
compiledRules;
|
|
787
1324
|
logger;
|
|
788
|
-
metadataDir;
|
|
789
1325
|
/**
|
|
790
1326
|
* Create a new DocumentProcessor.
|
|
791
1327
|
*
|
|
792
|
-
* @param config - The
|
|
1328
|
+
* @param config - The processor configuration.
|
|
793
1329
|
* @param embeddingProvider - The embedding provider.
|
|
794
1330
|
* @param vectorStore - The vector store client.
|
|
795
1331
|
* @param compiledRules - The compiled inference rules.
|
|
@@ -801,7 +1337,6 @@
|
|
|
801
1337
|
this.vectorStore = vectorStore;
|
|
802
1338
|
this.compiledRules = compiledRules;
|
|
803
1339
|
this.logger = logger;
|
|
804
|
-
this.metadataDir = config.metadataDir ?? '.jeeves-metadata';
|
|
805
1340
|
}
|
|
806
1341
|
/**
|
|
807
1342
|
* Process a file through the full pipeline: extract, hash, chunk, embed, upsert.
|
|
@@ -811,9 +1346,8 @@
|
|
|
811
1346
|
async processFile(filePath) {
|
|
812
1347
|
try {
|
|
813
1348
|
const ext = node_path.extname(filePath);
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
const extracted = await extractText(filePath, ext);
|
|
1349
|
+
// 1. Build merged metadata + extract text
|
|
1350
|
+
const { metadata, extracted } = await buildMergedMetadata(filePath, this.compiledRules, this.config.metadataDir, this.config.maps, this.logger);
|
|
817
1351
|
if (!extracted.text.trim()) {
|
|
818
1352
|
this.logger.debug({ filePath }, 'Skipping empty file');
|
|
819
1353
|
return;
|
|
@@ -826,26 +1360,15 @@
|
|
|
826
1360
|
this.logger.debug({ filePath }, 'Content unchanged, skipping');
|
|
827
1361
|
return;
|
|
828
1362
|
}
|
|
829
|
-
const oldTotalChunks =
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
const
|
|
834
|
-
const inferred = applyRules(this.compiledRules, attributes);
|
|
835
|
-
// 4. Read enrichment metadata (merge, enrichment wins)
|
|
836
|
-
const enrichment = await readMetadata(filePath, this.metadataDir);
|
|
837
|
-
const metadata = {
|
|
838
|
-
...inferred,
|
|
839
|
-
...(enrichment ?? {}),
|
|
840
|
-
};
|
|
841
|
-
// 5. Chunk text
|
|
842
|
-
const chunkSize = this.config.embedding.chunkSize ?? 1000;
|
|
843
|
-
const chunkOverlap = this.config.embedding.chunkOverlap ?? 200;
|
|
844
|
-
const splitter = this.createSplitter(ext, chunkSize, chunkOverlap);
|
|
1363
|
+
const oldTotalChunks = getChunkCount(existingPayload);
|
|
1364
|
+
// 3. Chunk text
|
|
1365
|
+
const chunkSize = this.config.chunkSize ?? 1000;
|
|
1366
|
+
const chunkOverlap = this.config.chunkOverlap ?? 200;
|
|
1367
|
+
const splitter = createSplitter(ext, chunkSize, chunkOverlap);
|
|
845
1368
|
const chunks = await splitter.splitText(extracted.text);
|
|
846
|
-
//
|
|
1369
|
+
// 4. Embed all chunks
|
|
847
1370
|
const vectors = await this.embeddingProvider.embed(chunks);
|
|
848
|
-
//
|
|
1371
|
+
// 5. Upsert all chunk points
|
|
849
1372
|
const points = chunks.map((chunk, i) => ({
|
|
850
1373
|
id: pointId(filePath, i),
|
|
851
1374
|
vector: vectors[i],
|
|
@@ -859,12 +1382,9 @@
|
|
|
859
1382
|
},
|
|
860
1383
|
}));
|
|
861
1384
|
await this.vectorStore.upsert(points);
|
|
862
|
-
//
|
|
1385
|
+
// 6. Clean up orphaned chunks
|
|
863
1386
|
if (oldTotalChunks > chunks.length) {
|
|
864
|
-
const orphanIds =
|
|
865
|
-
for (let i = chunks.length; i < oldTotalChunks; i++) {
|
|
866
|
-
orphanIds.push(pointId(filePath, i));
|
|
867
|
-
}
|
|
1387
|
+
const orphanIds = chunkIds(filePath, oldTotalChunks).slice(chunks.length);
|
|
868
1388
|
await this.vectorStore.delete(orphanIds);
|
|
869
1389
|
}
|
|
870
1390
|
this.logger.info({ filePath, chunks: chunks.length }, 'File processed successfully');
|
|
@@ -883,15 +1403,10 @@
|
|
|
883
1403
|
// Get the existing payload to find total chunks
|
|
884
1404
|
const baseId = pointId(filePath, 0);
|
|
885
1405
|
const existingPayload = await this.vectorStore.getPayload(baseId);
|
|
886
|
-
const totalChunks =
|
|
887
|
-
|
|
888
|
-
: 1;
|
|
889
|
-
const ids = [];
|
|
890
|
-
for (let i = 0; i < totalChunks; i++) {
|
|
891
|
-
ids.push(pointId(filePath, i));
|
|
892
|
-
}
|
|
1406
|
+
const totalChunks = getChunkCount(existingPayload);
|
|
1407
|
+
const ids = chunkIds(filePath, totalChunks);
|
|
893
1408
|
await this.vectorStore.delete(ids);
|
|
894
|
-
await deleteMetadata(filePath, this.metadataDir);
|
|
1409
|
+
await deleteMetadata(filePath, this.config.metadataDir);
|
|
895
1410
|
this.logger.info({ filePath }, 'File deleted from index');
|
|
896
1411
|
}
|
|
897
1412
|
catch (error) {
|
|
@@ -908,21 +1423,16 @@
|
|
|
908
1423
|
async processMetadataUpdate(filePath, metadata) {
|
|
909
1424
|
try {
|
|
910
1425
|
// Read existing enrichment metadata and merge
|
|
911
|
-
const existing = (await readMetadata(filePath, this.metadataDir)) ?? {};
|
|
1426
|
+
const existing = (await readMetadata(filePath, this.config.metadataDir)) ?? {};
|
|
912
1427
|
const merged = { ...existing, ...metadata };
|
|
913
|
-
await writeMetadata(filePath, this.metadataDir, merged);
|
|
1428
|
+
await writeMetadata(filePath, this.config.metadataDir, merged);
|
|
914
1429
|
// Update all chunk payloads in Qdrant
|
|
915
1430
|
const baseId = pointId(filePath, 0);
|
|
916
1431
|
const existingPayload = await this.vectorStore.getPayload(baseId);
|
|
917
1432
|
if (!existingPayload)
|
|
918
1433
|
return null;
|
|
919
|
-
const totalChunks =
|
|
920
|
-
|
|
921
|
-
: 1;
|
|
922
|
-
const ids = [];
|
|
923
|
-
for (let i = 0; i < totalChunks; i++) {
|
|
924
|
-
ids.push(pointId(filePath, i));
|
|
925
|
-
}
|
|
1434
|
+
const totalChunks = getChunkCount(existingPayload);
|
|
1435
|
+
const ids = chunkIds(filePath, totalChunks);
|
|
926
1436
|
await this.vectorStore.setPayload(ids, merged);
|
|
927
1437
|
this.logger.info({ filePath, chunks: totalChunks }, 'Metadata updated');
|
|
928
1438
|
return merged;
|
|
@@ -948,27 +1458,11 @@
|
|
|
948
1458
|
this.logger.debug({ filePath }, 'File not indexed, skipping');
|
|
949
1459
|
return null;
|
|
950
1460
|
}
|
|
951
|
-
|
|
952
|
-
const
|
|
953
|
-
// Extract frontmatter/json for attribute building (lightweight)
|
|
954
|
-
const extracted = await extractText(filePath, ext);
|
|
955
|
-
// Build attributes + apply current rules
|
|
956
|
-
const attributes = buildAttributes(filePath, stats, extracted.frontmatter, extracted.json);
|
|
957
|
-
const inferred = applyRules(this.compiledRules, attributes);
|
|
958
|
-
// Read enrichment metadata (merge, enrichment wins)
|
|
959
|
-
const enrichment = await readMetadata(filePath, this.metadataDir);
|
|
960
|
-
const metadata = {
|
|
961
|
-
...inferred,
|
|
962
|
-
...(enrichment ?? {}),
|
|
963
|
-
};
|
|
1461
|
+
// Build merged metadata (lightweight — no embedding)
|
|
1462
|
+
const { metadata } = await buildMergedMetadata(filePath, this.compiledRules, this.config.metadataDir, this.config.maps, this.logger);
|
|
964
1463
|
// Update all chunk payloads
|
|
965
|
-
const totalChunks =
|
|
966
|
-
|
|
967
|
-
: 1;
|
|
968
|
-
const ids = [];
|
|
969
|
-
for (let i = 0; i < totalChunks; i++) {
|
|
970
|
-
ids.push(pointId(filePath, i));
|
|
971
|
-
}
|
|
1464
|
+
const totalChunks = getChunkCount(existingPayload);
|
|
1465
|
+
const ids = chunkIds(filePath, totalChunks);
|
|
972
1466
|
await this.vectorStore.setPayload(ids, metadata);
|
|
973
1467
|
this.logger.info({ filePath, chunks: totalChunks }, 'Rules re-applied');
|
|
974
1468
|
return metadata;
|
|
@@ -987,23 +1481,12 @@
|
|
|
987
1481
|
this.compiledRules = compiledRules;
|
|
988
1482
|
this.logger.info({ rules: compiledRules.length }, 'Inference rules updated');
|
|
989
1483
|
}
|
|
990
|
-
/**
|
|
991
|
-
* Create the appropriate text splitter for the given file extension.
|
|
992
|
-
*
|
|
993
|
-
* @param ext - File extension.
|
|
994
|
-
* @param chunkSize - Maximum chunk size in characters.
|
|
995
|
-
* @param chunkOverlap - Overlap between chunks in characters.
|
|
996
|
-
* @returns A text splitter instance.
|
|
997
|
-
*/
|
|
998
|
-
createSplitter(ext, chunkSize, chunkOverlap) {
|
|
999
|
-
const lowerExt = ext.toLowerCase();
|
|
1000
|
-
if (lowerExt === '.md' || lowerExt === '.markdown') {
|
|
1001
|
-
return new textsplitters.MarkdownTextSplitter({ chunkSize, chunkOverlap });
|
|
1002
|
-
}
|
|
1003
|
-
return new textsplitters.RecursiveCharacterTextSplitter({ chunkSize, chunkOverlap });
|
|
1004
|
-
}
|
|
1005
1484
|
}
|
|
1006
1485
|
|
|
1486
|
+
/**
|
|
1487
|
+
* @module queue
|
|
1488
|
+
* Debounced, rate-limited, concurrent event queue for file watchers. Manages priority queuing and async callbacks. No direct I/O; orchestrates processing.
|
|
1489
|
+
*/
|
|
1007
1490
|
/**
|
|
1008
1491
|
* A debounced, rate-limited, concurrent event queue.
|
|
1009
1492
|
*/
|
|
@@ -1152,19 +1635,23 @@
|
|
|
1152
1635
|
client;
|
|
1153
1636
|
collectionName;
|
|
1154
1637
|
dims;
|
|
1638
|
+
log;
|
|
1155
1639
|
/**
|
|
1156
1640
|
* Create a new VectorStoreClient.
|
|
1157
1641
|
*
|
|
1158
1642
|
* @param config - Vector store configuration.
|
|
1159
1643
|
* @param dimensions - The embedding vector dimensions.
|
|
1644
|
+
* @param logger - Optional pino logger for retry warnings.
|
|
1160
1645
|
*/
|
|
1161
|
-
constructor(config, dimensions) {
|
|
1646
|
+
constructor(config, dimensions, logger) {
|
|
1162
1647
|
this.client = new jsClientRest.QdrantClient({
|
|
1163
1648
|
url: config.url,
|
|
1164
1649
|
apiKey: config.apiKey,
|
|
1650
|
+
checkCompatibility: false,
|
|
1165
1651
|
});
|
|
1166
1652
|
this.collectionName = config.collectionName;
|
|
1167
1653
|
this.dims = dimensions;
|
|
1654
|
+
this.log = getLogger(logger);
|
|
1168
1655
|
}
|
|
1169
1656
|
/**
|
|
1170
1657
|
* Ensure the collection exists with correct dimensions and Cosine distance.
|
|
@@ -1191,13 +1678,26 @@
|
|
|
1191
1678
|
async upsert(points) {
|
|
1192
1679
|
if (points.length === 0)
|
|
1193
1680
|
return;
|
|
1194
|
-
await
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1681
|
+
await retry(async (attempt) => {
|
|
1682
|
+
if (attempt > 1) {
|
|
1683
|
+
this.log.warn({ attempt, operation: 'qdrant.upsert', points: points.length }, 'Retrying Qdrant upsert');
|
|
1684
|
+
}
|
|
1685
|
+
await this.client.upsert(this.collectionName, {
|
|
1686
|
+
wait: true,
|
|
1687
|
+
points: points.map((p) => ({
|
|
1688
|
+
id: p.id,
|
|
1689
|
+
vector: p.vector,
|
|
1690
|
+
payload: p.payload,
|
|
1691
|
+
})),
|
|
1692
|
+
});
|
|
1693
|
+
}, {
|
|
1694
|
+
attempts: 5,
|
|
1695
|
+
baseDelayMs: 500,
|
|
1696
|
+
maxDelayMs: 10_000,
|
|
1697
|
+
jitter: 0.2,
|
|
1698
|
+
onRetry: ({ attempt, delayMs, error }) => {
|
|
1699
|
+
this.log.warn({ attempt, delayMs, operation: 'qdrant.upsert', error }, 'Qdrant upsert failed; will retry');
|
|
1700
|
+
},
|
|
1201
1701
|
});
|
|
1202
1702
|
}
|
|
1203
1703
|
/**
|
|
@@ -1208,9 +1708,22 @@
|
|
|
1208
1708
|
async delete(ids) {
|
|
1209
1709
|
if (ids.length === 0)
|
|
1210
1710
|
return;
|
|
1211
|
-
await
|
|
1212
|
-
|
|
1213
|
-
|
|
1711
|
+
await retry(async (attempt) => {
|
|
1712
|
+
if (attempt > 1) {
|
|
1713
|
+
this.log.warn({ attempt, operation: 'qdrant.delete', ids: ids.length }, 'Retrying Qdrant delete');
|
|
1714
|
+
}
|
|
1715
|
+
await this.client.delete(this.collectionName, {
|
|
1716
|
+
wait: true,
|
|
1717
|
+
points: ids,
|
|
1718
|
+
});
|
|
1719
|
+
}, {
|
|
1720
|
+
attempts: 5,
|
|
1721
|
+
baseDelayMs: 500,
|
|
1722
|
+
maxDelayMs: 10_000,
|
|
1723
|
+
jitter: 0.2,
|
|
1724
|
+
onRetry: ({ attempt, delayMs, error }) => {
|
|
1725
|
+
this.log.warn({ attempt, delayMs, operation: 'qdrant.delete', error }, 'Qdrant delete failed; will retry');
|
|
1726
|
+
},
|
|
1214
1727
|
});
|
|
1215
1728
|
}
|
|
1216
1729
|
/**
|
|
@@ -1310,6 +1823,10 @@
|
|
|
1310
1823
|
}
|
|
1311
1824
|
}
|
|
1312
1825
|
|
|
1826
|
+
/**
|
|
1827
|
+
* @module watcher
|
|
1828
|
+
* Filesystem watcher wrapping chokidar. I/O: watches files/directories for add/change/unlink events, enqueues to processing queue.
|
|
1829
|
+
*/
|
|
1313
1830
|
/**
|
|
1314
1831
|
* Filesystem watcher that maps chokidar events to the processing queue.
|
|
1315
1832
|
*/
|
|
@@ -1376,57 +1893,141 @@
|
|
|
1376
1893
|
}
|
|
1377
1894
|
}
|
|
1378
1895
|
|
|
1896
|
+
/**
|
|
1897
|
+
* @module app/configWatcher
|
|
1898
|
+
* Watches the config file for changes and triggers debounced reload. Isolated I/O wrapper around chokidar.
|
|
1899
|
+
*/
|
|
1900
|
+
/**
|
|
1901
|
+
* Debounced config file watcher.
|
|
1902
|
+
*/
|
|
1903
|
+
class ConfigWatcher {
|
|
1904
|
+
options;
|
|
1905
|
+
watcher;
|
|
1906
|
+
debounce;
|
|
1907
|
+
constructor(options) {
|
|
1908
|
+
this.options = options;
|
|
1909
|
+
}
|
|
1910
|
+
start() {
|
|
1911
|
+
if (!this.options.enabled)
|
|
1912
|
+
return;
|
|
1913
|
+
this.watcher = chokidar.watch(this.options.configPath, {
|
|
1914
|
+
ignoreInitial: true,
|
|
1915
|
+
});
|
|
1916
|
+
this.watcher.on('change', () => {
|
|
1917
|
+
if (this.debounce)
|
|
1918
|
+
clearTimeout(this.debounce);
|
|
1919
|
+
this.debounce = setTimeout(() => {
|
|
1920
|
+
void this.options.onChange();
|
|
1921
|
+
}, this.options.debounceMs);
|
|
1922
|
+
});
|
|
1923
|
+
this.watcher.on('error', (error) => {
|
|
1924
|
+
this.options.logger.error({ error }, 'Config watcher error');
|
|
1925
|
+
});
|
|
1926
|
+
this.options.logger.info({
|
|
1927
|
+
configPath: this.options.configPath,
|
|
1928
|
+
debounceMs: this.options.debounceMs,
|
|
1929
|
+
}, 'Config watcher started');
|
|
1930
|
+
}
|
|
1931
|
+
async stop() {
|
|
1932
|
+
if (this.debounce) {
|
|
1933
|
+
clearTimeout(this.debounce);
|
|
1934
|
+
this.debounce = undefined;
|
|
1935
|
+
}
|
|
1936
|
+
if (this.watcher) {
|
|
1937
|
+
await this.watcher.close();
|
|
1938
|
+
this.watcher = undefined;
|
|
1939
|
+
}
|
|
1940
|
+
}
|
|
1941
|
+
}
|
|
1942
|
+
|
|
1943
|
+
/**
|
|
1944
|
+
* @module app/shutdown
|
|
1945
|
+
* Process signal shutdown orchestration. Installs SIGINT/SIGTERM handlers that invoke a provided async stop function.
|
|
1946
|
+
*/
|
|
1947
|
+
/**
|
|
1948
|
+
* Install process signal handlers.
|
|
1949
|
+
*
|
|
1950
|
+
* @param stop - Async stop function to invoke on shutdown signals.
|
|
1951
|
+
*/
|
|
1952
|
+
function installShutdownHandlers(stop) {
|
|
1953
|
+
const shutdown = async () => {
|
|
1954
|
+
await stop();
|
|
1955
|
+
process.exit(0);
|
|
1956
|
+
};
|
|
1957
|
+
process.on('SIGTERM', () => void shutdown());
|
|
1958
|
+
process.on('SIGINT', () => void shutdown());
|
|
1959
|
+
}
|
|
1960
|
+
|
|
1961
|
+
const defaultFactories = {
|
|
1962
|
+
loadConfig,
|
|
1963
|
+
createLogger,
|
|
1964
|
+
createEmbeddingProvider,
|
|
1965
|
+
createVectorStoreClient: (config, dimensions, logger) => new VectorStoreClient(config, dimensions, logger),
|
|
1966
|
+
compileRules,
|
|
1967
|
+
createDocumentProcessor: (config, embeddingProvider, vectorStore, compiledRules, logger) => new DocumentProcessor(config, embeddingProvider, vectorStore, compiledRules, logger),
|
|
1968
|
+
createEventQueue: (options) => new EventQueue(options),
|
|
1969
|
+
createFileSystemWatcher: (config, queue, processor, logger) => new FileSystemWatcher(config, queue, processor, logger),
|
|
1970
|
+
createApiServer,
|
|
1971
|
+
};
|
|
1379
1972
|
/**
|
|
1380
1973
|
* Main application class that wires together all components.
|
|
1381
1974
|
*/
|
|
1382
1975
|
class JeevesWatcher {
|
|
1383
1976
|
config;
|
|
1384
1977
|
configPath;
|
|
1978
|
+
factories;
|
|
1385
1979
|
logger;
|
|
1386
1980
|
watcher;
|
|
1387
1981
|
queue;
|
|
1388
1982
|
server;
|
|
1389
1983
|
processor;
|
|
1390
1984
|
configWatcher;
|
|
1391
|
-
configDebounce;
|
|
1392
1985
|
/**
|
|
1393
1986
|
* Create a new JeevesWatcher instance.
|
|
1394
1987
|
*
|
|
1395
1988
|
* @param config - The application configuration.
|
|
1396
1989
|
* @param configPath - Optional config file path to watch for changes.
|
|
1990
|
+
* @param factories - Optional component factories (for dependency injection).
|
|
1397
1991
|
*/
|
|
1398
|
-
constructor(config, configPath) {
|
|
1992
|
+
constructor(config, configPath, factories = {}) {
|
|
1399
1993
|
this.config = config;
|
|
1400
1994
|
this.configPath = configPath;
|
|
1995
|
+
this.factories = { ...defaultFactories, ...factories };
|
|
1401
1996
|
}
|
|
1402
1997
|
/**
|
|
1403
1998
|
* Start the watcher, API server, and all components.
|
|
1404
1999
|
*/
|
|
1405
2000
|
async start() {
|
|
1406
|
-
const logger = createLogger(this.config.logging);
|
|
2001
|
+
const logger = this.factories.createLogger(this.config.logging);
|
|
1407
2002
|
this.logger = logger;
|
|
1408
2003
|
let embeddingProvider;
|
|
1409
2004
|
try {
|
|
1410
|
-
embeddingProvider = createEmbeddingProvider(this.config.embedding);
|
|
2005
|
+
embeddingProvider = this.factories.createEmbeddingProvider(this.config.embedding, logger);
|
|
1411
2006
|
}
|
|
1412
2007
|
catch (error) {
|
|
1413
2008
|
logger.fatal({ error }, 'Failed to create embedding provider');
|
|
1414
2009
|
throw error;
|
|
1415
2010
|
}
|
|
1416
|
-
const vectorStore =
|
|
2011
|
+
const vectorStore = this.factories.createVectorStoreClient(this.config.vectorStore, embeddingProvider.dimensions, logger);
|
|
1417
2012
|
await vectorStore.ensureCollection();
|
|
1418
|
-
const compiledRules = compileRules(this.config.inferenceRules ?? []);
|
|
1419
|
-
const
|
|
2013
|
+
const compiledRules = this.factories.compileRules(this.config.inferenceRules ?? []);
|
|
2014
|
+
const processorConfig = {
|
|
2015
|
+
metadataDir: this.config.metadataDir ?? '.jeeves-metadata',
|
|
2016
|
+
chunkSize: this.config.embedding.chunkSize,
|
|
2017
|
+
chunkOverlap: this.config.embedding.chunkOverlap,
|
|
2018
|
+
maps: this.config.maps,
|
|
2019
|
+
};
|
|
2020
|
+
const processor = this.factories.createDocumentProcessor(processorConfig, embeddingProvider, vectorStore, compiledRules, logger);
|
|
1420
2021
|
this.processor = processor;
|
|
1421
|
-
const queue =
|
|
2022
|
+
const queue = this.factories.createEventQueue({
|
|
1422
2023
|
debounceMs: this.config.watch.debounceMs ?? 2000,
|
|
1423
2024
|
concurrency: this.config.embedding.concurrency ?? 5,
|
|
1424
2025
|
rateLimitPerMinute: this.config.embedding.rateLimitPerMinute,
|
|
1425
2026
|
});
|
|
1426
2027
|
this.queue = queue;
|
|
1427
|
-
const watcher =
|
|
2028
|
+
const watcher = this.factories.createFileSystemWatcher(this.config.watch, queue, processor, logger);
|
|
1428
2029
|
this.watcher = watcher;
|
|
1429
|
-
const server = createApiServer({
|
|
2030
|
+
const server = this.factories.createApiServer({
|
|
1430
2031
|
processor,
|
|
1431
2032
|
vectorStore,
|
|
1432
2033
|
embeddingProvider,
|
|
@@ -1437,7 +2038,7 @@
|
|
|
1437
2038
|
this.server = server;
|
|
1438
2039
|
await server.listen({
|
|
1439
2040
|
host: this.config.api?.host ?? '127.0.0.1',
|
|
1440
|
-
port: this.config.api?.port ??
|
|
2041
|
+
port: this.config.api?.port ?? 3456,
|
|
1441
2042
|
});
|
|
1442
2043
|
watcher.start();
|
|
1443
2044
|
this.startConfigWatch();
|
|
@@ -1453,12 +2054,17 @@
|
|
|
1453
2054
|
}
|
|
1454
2055
|
if (this.queue) {
|
|
1455
2056
|
const timeout = this.config.shutdownTimeoutMs ?? 10000;
|
|
1456
|
-
await Promise.race([
|
|
1457
|
-
this.queue.drain(),
|
|
2057
|
+
const drained = await Promise.race([
|
|
2058
|
+
this.queue.drain().then(() => true),
|
|
1458
2059
|
new Promise((resolve) => {
|
|
1459
|
-
setTimeout(
|
|
2060
|
+
setTimeout(() => {
|
|
2061
|
+
resolve(false);
|
|
2062
|
+
}, timeout);
|
|
1460
2063
|
}),
|
|
1461
2064
|
]);
|
|
2065
|
+
if (!drained) {
|
|
2066
|
+
this.logger?.warn({ timeoutMs: timeout }, 'Queue drain timeout hit, forcing shutdown');
|
|
2067
|
+
}
|
|
1462
2068
|
}
|
|
1463
2069
|
if (this.server) {
|
|
1464
2070
|
await this.server.close();
|
|
@@ -1477,28 +2083,18 @@
|
|
|
1477
2083
|
return;
|
|
1478
2084
|
}
|
|
1479
2085
|
const debounceMs = this.config.configWatch?.debounceMs ?? 10000;
|
|
1480
|
-
this.configWatcher =
|
|
1481
|
-
|
|
1482
|
-
|
|
1483
|
-
|
|
1484
|
-
|
|
1485
|
-
|
|
1486
|
-
this.configDebounce = setTimeout(() => {
|
|
1487
|
-
void this.reloadConfig();
|
|
1488
|
-
}, debounceMs);
|
|
1489
|
-
});
|
|
1490
|
-
this.configWatcher.on('error', (error) => {
|
|
1491
|
-
logger.error({ error }, 'Config watcher error');
|
|
2086
|
+
this.configWatcher = new ConfigWatcher({
|
|
2087
|
+
configPath: this.configPath,
|
|
2088
|
+
enabled,
|
|
2089
|
+
debounceMs,
|
|
2090
|
+
logger,
|
|
2091
|
+
onChange: async () => this.reloadConfig(),
|
|
1492
2092
|
});
|
|
1493
|
-
|
|
2093
|
+
this.configWatcher.start();
|
|
1494
2094
|
}
|
|
1495
2095
|
async stopConfigWatch() {
|
|
1496
|
-
if (this.configDebounce) {
|
|
1497
|
-
clearTimeout(this.configDebounce);
|
|
1498
|
-
this.configDebounce = undefined;
|
|
1499
|
-
}
|
|
1500
2096
|
if (this.configWatcher) {
|
|
1501
|
-
await this.configWatcher.
|
|
2097
|
+
await this.configWatcher.stop();
|
|
1502
2098
|
this.configWatcher = undefined;
|
|
1503
2099
|
}
|
|
1504
2100
|
}
|
|
@@ -1507,10 +2103,11 @@
|
|
|
1507
2103
|
const processor = this.processor;
|
|
1508
2104
|
if (!logger || !processor || !this.configPath)
|
|
1509
2105
|
return;
|
|
2106
|
+
logger.info({ configPath: this.configPath }, 'Config change detected, reloading...');
|
|
1510
2107
|
try {
|
|
1511
|
-
const newConfig = await loadConfig(this.configPath);
|
|
2108
|
+
const newConfig = await this.factories.loadConfig(this.configPath);
|
|
1512
2109
|
this.config = newConfig;
|
|
1513
|
-
const compiledRules = compileRules(newConfig.inferenceRules ?? []);
|
|
2110
|
+
const compiledRules = this.factories.compileRules(newConfig.inferenceRules ?? []);
|
|
1514
2111
|
processor.updateRules(compiledRules);
|
|
1515
2112
|
logger.info({ configPath: this.configPath, rules: compiledRules.length }, 'Config reloaded');
|
|
1516
2113
|
}
|
|
@@ -1528,12 +2125,7 @@
|
|
|
1528
2125
|
async function startFromConfig(configPath) {
|
|
1529
2126
|
const config = await loadConfig(configPath);
|
|
1530
2127
|
const app = new JeevesWatcher(config, configPath);
|
|
1531
|
-
|
|
1532
|
-
await app.stop();
|
|
1533
|
-
process.exit(0);
|
|
1534
|
-
};
|
|
1535
|
-
process.on('SIGTERM', () => void shutdown());
|
|
1536
|
-
process.on('SIGINT', () => void shutdown());
|
|
2128
|
+
installShutdownHandlers(() => app.stop());
|
|
1537
2129
|
await app.start();
|
|
1538
2130
|
return app;
|
|
1539
2131
|
}
|
|
@@ -1543,20 +2135,28 @@
|
|
|
1543
2135
|
exports.FileSystemWatcher = FileSystemWatcher;
|
|
1544
2136
|
exports.JeevesWatcher = JeevesWatcher;
|
|
1545
2137
|
exports.VectorStoreClient = VectorStoreClient;
|
|
2138
|
+
exports.apiConfigSchema = apiConfigSchema;
|
|
1546
2139
|
exports.applyRules = applyRules;
|
|
1547
2140
|
exports.buildAttributes = buildAttributes;
|
|
1548
2141
|
exports.compileRules = compileRules;
|
|
2142
|
+
exports.configWatchConfigSchema = configWatchConfigSchema;
|
|
1549
2143
|
exports.contentHash = contentHash;
|
|
1550
2144
|
exports.createApiServer = createApiServer;
|
|
1551
2145
|
exports.createEmbeddingProvider = createEmbeddingProvider;
|
|
1552
2146
|
exports.createLogger = createLogger;
|
|
1553
2147
|
exports.deleteMetadata = deleteMetadata;
|
|
2148
|
+
exports.embeddingConfigSchema = embeddingConfigSchema;
|
|
1554
2149
|
exports.extractText = extractText;
|
|
2150
|
+
exports.inferenceRuleSchema = inferenceRuleSchema;
|
|
2151
|
+
exports.jeevesWatcherConfigSchema = jeevesWatcherConfigSchema;
|
|
1555
2152
|
exports.loadConfig = loadConfig;
|
|
2153
|
+
exports.loggingConfigSchema = loggingConfigSchema;
|
|
1556
2154
|
exports.metadataPath = metadataPath;
|
|
1557
2155
|
exports.pointId = pointId;
|
|
1558
2156
|
exports.readMetadata = readMetadata;
|
|
1559
2157
|
exports.startFromConfig = startFromConfig;
|
|
2158
|
+
exports.vectorStoreConfigSchema = vectorStoreConfigSchema;
|
|
2159
|
+
exports.watchConfigSchema = watchConfigSchema;
|
|
1560
2160
|
exports.writeMetadata = writeMetadata;
|
|
1561
2161
|
|
|
1562
|
-
})(this["jeeves-watcher"] = this["jeeves-watcher"] || {}, Fastify,
|
|
2162
|
+
})(this["jeeves-watcher"] = this["jeeves-watcher"] || {}, Fastify, promises, node_path, picomatch, radash, node_crypto, cosmiconfig, zod, jsonmap, googleGenai, pino, uuid, cheerio, yaml, mammoth, Ajv, addFormats, textsplitters, jsClientRest, chokidar);
|