@karmaniverous/jeeves-watcher 0.2.4 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/mjs/index.js CHANGED
@@ -1,6 +1,6 @@
1
1
  import Fastify from 'fastify';
2
2
  import { readdir, stat, rm, readFile, mkdir, writeFile } from 'node:fs/promises';
3
- import { resolve, dirname, join, extname, basename } from 'node:path';
3
+ import { resolve, dirname, join, relative, extname, basename } from 'node:path';
4
4
  import picomatch from 'picomatch';
5
5
  import { omit, get } from 'radash';
6
6
  import { createHash } from 'node:crypto';
@@ -8,6 +8,8 @@ import { cosmiconfig } from 'cosmiconfig';
8
8
  import { z, ZodError } from 'zod';
9
9
  import { jsonMapMapSchema, JsonMap } from '@karmaniverous/jsonmap';
10
10
  import { GoogleGenerativeAIEmbeddings } from '@langchain/google-genai';
11
+ import { existsSync, statSync, readdirSync, readFileSync } from 'node:fs';
12
+ import ignore from 'ignore';
11
13
  import pino from 'pino';
12
14
  import { v5 } from 'uuid';
13
15
  import * as cheerio from 'cheerio';
@@ -433,6 +435,7 @@ const WATCH_DEFAULTS = {
433
435
  stabilityThresholdMs: 500,
434
436
  usePolling: false,
435
437
  pollIntervalMs: 1000,
438
+ respectGitignore: true,
436
439
  };
437
440
  /** Default embedding configuration. */
438
441
  const EMBEDDING_DEFAULTS = {
@@ -477,6 +480,11 @@ const watchConfigSchema = z.object({
477
480
  .number()
478
481
  .optional()
479
482
  .describe('Time in milliseconds a file must remain unchanged before processing.'),
483
+ /** Whether to respect .gitignore files when processing. */
484
+ respectGitignore: z
485
+ .boolean()
486
+ .optional()
487
+ .describe('Skip files ignored by .gitignore in git repositories. Only applies to repos with a .git directory. Default: true.'),
480
488
  });
481
489
  /**
482
490
  * Configuration watch settings.
@@ -942,6 +950,212 @@ function createEmbeddingProvider(config, logger) {
942
950
  return factory(config, logger);
943
951
  }
944
952
 
953
+ /**
954
+ * @module gitignore
955
+ * Processor-level gitignore filtering. Scans watched paths for `.gitignore` files in git repos, caches parsed patterns, and exposes `isIgnored()` for path checking.
956
+ */
957
/**
 * Locate the git repository that encloses a directory.
 *
 * Starting at `startDir` and moving one parent at a time toward the
 * filesystem root, returns the first directory that contains a `.git`
 * entry which is a directory. A `.git` *file* does not count, and the
 * filesystem root itself is never tested.
 *
 * @param startDir - Directory to start from; resolved to an absolute path.
 * @returns The repository root, or `undefined` when none is found.
 */
function findRepoRoot(startDir) {
    const fsRoot = resolve('/');
    for (let current = resolve(startDir); current !== fsRoot;) {
        const gitDir = join(current, '.git');
        if (existsSync(gitDir) && statSync(gitDir).isDirectory()) {
            return current;
        }
        const up = dirname(current);
        // `dirname` returns its input once the top of the tree is reached.
        if (up === current) {
            return undefined;
        }
        current = up;
    }
    return undefined;
}
976
/**
 * Map a watch path to an existing directory from which a repo-root search
 * can start.
 *
 * - An existing directory is returned as-is (absolute).
 * - An existing non-directory yields its parent directory.
 * - Otherwise, if the path contains a glob character (`*`, `?`, `[`, `{`),
 *   the literal text before the first such character is used: kept whole
 *   when it ends in a path separator, otherwise reduced to its parent.
 *   An empty prefix means the current working directory.
 *
 * @param watchPath - Directory, file path, or glob pattern.
 * @returns An absolute directory path, or `undefined` when the path is
 *   neither an existing entry nor a glob with an existing base directory.
 */
function watchPathToScanDir(watchPath) {
    const absolute = resolve(watchPath);
    try {
        const info = statSync(absolute);
        if (info.isDirectory()) {
            return absolute;
        }
        return dirname(absolute);
    }
    catch {
        // Path does not exist as written; it may still be a glob.
    }
    const firstGlobChar = /[*?[{]/.exec(watchPath);
    if (firstGlobChar === null) {
        return undefined;
    }
    const literalPrefix = watchPath.slice(0, firstGlobChar.index).trim();
    let baseDir;
    if (literalPrefix.length === 0) {
        baseDir = '.';
    }
    else if (literalPrefix.endsWith('/') || literalPrefix.endsWith('\\')) {
        baseDir = literalPrefix;
    }
    else {
        baseDir = dirname(literalPrefix);
    }
    const candidate = resolve(baseDir);
    return existsSync(candidate) ? candidate : undefined;
}
1004
/**
 * Recursively collect the paths of all `.gitignore` files under `dir`.
 *
 * Directories named `.git` and `node_modules` are skipped for performance.
 * Only real directories are descended into: directory entries are read with
 * their types, and a symbolic link reports as a link rather than as a
 * directory. This keeps a link that points back at an ancestor from being
 * re-walked and from producing duplicate results.
 *
 * @param dir - Directory to search.
 * @returns Paths of the `.gitignore` files found, with a directory's own
 *   file listed before those of its subdirectories. Directories that cannot
 *   be read contribute nothing beyond their own `.gitignore`, if present.
 */
function findGitignoreFiles(dir) {
    const results = [];
    const gitignorePath = join(dir, '.gitignore');
    if (existsSync(gitignorePath)) {
        results.push(gitignorePath);
    }
    let entries;
    try {
        entries = readdirSync(dir, { withFileTypes: true });
    }
    catch {
        // Unreadable or missing directory: report what was found so far.
        return results;
    }
    for (const entry of entries) {
        if (entry.name === '.git' || entry.name === 'node_modules') {
            continue;
        }
        if (!entry.isDirectory()) {
            continue;
        }
        for (const nested of findGitignoreFiles(join(dir, entry.name))) {
            results.push(nested);
        }
    }
    return results;
}
1036
/**
 * Read a `.gitignore` file and load its contents into a fresh `ignore`
 * matcher.
 *
 * @param gitignorePath - Path of the `.gitignore` file to read (UTF-8).
 * @returns The `ignore` instance holding the file's patterns.
 */
function parseGitignore(gitignorePath) {
    const patterns = readFileSync(gitignorePath, 'utf8');
    const matcher = ignore();
    return matcher.add(patterns);
}
1043
/**
 * Replace every backslash in `p` with a forward slash, the separator the
 * `ignore` package expects.
 *
 * @param p - Path string to normalize.
 * @returns `p` with all `\` characters turned into `/`.
 */
function toForwardSlash(p) {
    return p.split('\\').join('/');
}
1049
/**
 * Processor-level gitignore filter. Checks file paths against the
 * `.gitignore` files of the git repositories that contain the watched paths.
 *
 * A path counts as ignored as soon as any applicable `.gitignore` ignores
 * it, so a negation (`!pattern`) in a nested file cannot re-include a path
 * that a parent `.gitignore` ignores.
 */
class GitignoreFilter {
    /**
     * Known repositories keyed by repo root. Each value is `{ root, entries }`
     * where `entries` holds one `{ dir, ig }` per `.gitignore` file, sorted
     * deepest directory first.
     */
    repos = new Map();
    /**
     * Create a GitignoreFilter by scanning watched paths for `.gitignore` files.
     *
     * @param watchPaths - Paths being watched (directories, files, or globs).
     */
    constructor(watchPaths) {
        this.scan(watchPaths);
    }
    /**
     * Sort comparator placing entries with longer directory paths first.
     * Among the ancestors of any one file, a longer path is a deeper one.
     */
    static deepestFirst(a, b) {
        return b.dir.length - a.dir.length;
    }
    /**
     * Compute `targetPath` relative to `baseDir`, but only when the target
     * lies at or below the base.
     *
     * Only a leading `..` path segment marks the target as outside, so names
     * that merely begin with two dots (e.g. `..cache`) stay inside. A result
     * that is itself absolute is treated as outside as well.
     *
     * @returns The relative path (`''` when both are equal), or `undefined`
     *   when the target is outside `baseDir`.
     */
    static relativeInside(baseDir, targetPath) {
        const rel = relative(baseDir, targetPath);
        const escapes = rel === '..' || rel.startsWith('../') || rel.startsWith('..\\');
        if (escapes || rel.startsWith(resolve('/'))) {
            return undefined;
        }
        return rel;
    }
    /**
     * Find and parse every `.gitignore` under `repoRoot` and register the
     * repository, replacing any previous registration for that root.
     *
     * @param repoRoot - Absolute path of the repository root.
     * @returns The registered `{ root, entries }` record.
     */
    loadRepo(repoRoot) {
        const entries = findGitignoreFiles(repoRoot).map((gf) => ({
            dir: dirname(gf),
            ig: parseGitignore(gf),
        }));
        entries.sort(GitignoreFilter.deepestFirst);
        const repo = { root: repoRoot, entries };
        this.repos.set(repoRoot, repo);
        return repo;
    }
    /**
     * Discard all cached state and rescan `watchPaths` for git repos and
     * their `.gitignore` files. Paths that resolve to no existing directory,
     * or that are not inside a git repo, are skipped.
     *
     * @param watchPaths - Paths being watched (directories, files, or globs).
     */
    scan(watchPaths) {
        this.repos.clear();
        const scannedDirs = new Set();
        for (const watchPath of watchPaths) {
            const scanDir = watchPathToScanDir(watchPath);
            if (!scanDir || scannedDirs.has(scanDir)) {
                continue;
            }
            scannedDirs.add(scanDir);
            const repoRoot = findRepoRoot(scanDir);
            if (!repoRoot || this.repos.has(repoRoot)) {
                continue;
            }
            this.loadRepo(repoRoot);
        }
    }
    /**
     * Check whether a file path is ignored by any applicable `.gitignore`.
     *
     * @param filePath - File path to check (resolved to absolute).
     * @returns `true` if the file should be ignored.
     */
    isIgnored(filePath) {
        const absPath = resolve(filePath);
        for (const repo of this.repos.values()) {
            if (GitignoreFilter.relativeInside(repo.root, absPath) === undefined) {
                continue;
            }
            // Entries are ordered deepest-first.
            for (const entry of repo.entries) {
                const relToEntry = GitignoreFilter.relativeInside(entry.dir, absPath);
                // Skip `.gitignore` files that do not govern this path. The
                // empty string (path equals the directory itself) is skipped
                // too, since the `ignore` package rejects empty paths.
                if (relToEntry === undefined || relToEntry === '') {
                    continue;
                }
                if (entry.ig.ignores(toForwardSlash(relToEntry))) {
                    return true;
                }
            }
        }
        return false;
    }
    /**
     * Invalidate and re-parse a specific `.gitignore` file.
     * Call when a `.gitignore` file is added, changed, or removed.
     *
     * Every registered repository containing the file's directory is
     * updated, so nested repositories do not keep stale rules. When no
     * registered repository contains it, the enclosing repository (if any)
     * is discovered and loaded in full. Paths whose file name is not exactly
     * `.gitignore` are ignored, so files such as `foo.gitignore` can never
     * replace a directory's real rules.
     *
     * @param gitignorePath - Path to the `.gitignore` file that changed.
     */
    invalidate(gitignorePath) {
        const absPath = resolve(gitignorePath);
        if (basename(absPath) !== '.gitignore') {
            return;
        }
        const gitignoreDir = dirname(absPath);
        let refreshed = false;
        for (const repo of this.repos.values()) {
            if (GitignoreFilter.relativeInside(repo.root, gitignoreDir) === undefined) {
                continue;
            }
            // Drop the stale entry for this directory, then re-add it if the
            // file still exists.
            repo.entries = repo.entries.filter((e) => e.dir !== gitignoreDir);
            if (existsSync(absPath)) {
                repo.entries.push({ dir: gitignoreDir, ig: parseGitignore(absPath) });
                repo.entries.sort(GitignoreFilter.deepestFirst);
            }
            refreshed = true;
        }
        if (refreshed) {
            return;
        }
        // Not inside any known repo: it may belong to one that did not exist
        // (or was not watched) at scan time. Load all of its `.gitignore`
        // files so it is treated the same as a repo found by `scan`.
        const repoRoot = findRepoRoot(gitignoreDir);
        if (repoRoot) {
            this.loadRepo(repoRoot);
        }
    }
}
1158
+
945
1159
  /**
946
1160
  * @module logger
947
1161
  * Creates pino logger instances. I/O: optionally writes logs to file via pino/file transport. Defaults to stdout at info level.
@@ -2033,6 +2247,7 @@ class FileSystemWatcher {
2033
2247
  processor;
2034
2248
  logger;
2035
2249
  health;
2250
+ gitignoreFilter;
2036
2251
  watcher;
2037
2252
  /**
2038
2253
  * Create a new FileSystemWatcher.
@@ -2048,6 +2263,7 @@ class FileSystemWatcher {
2048
2263
  this.queue = queue;
2049
2264
  this.processor = processor;
2050
2265
  this.logger = logger;
2266
+ this.gitignoreFilter = options.gitignoreFilter;
2051
2267
  const healthOptions = {
2052
2268
  maxRetries: options.maxRetries,
2053
2269
  maxBackoffMs: options.maxBackoffMs,
@@ -2070,14 +2286,23 @@ class FileSystemWatcher {
2070
2286
  ignoreInitial: false,
2071
2287
  });
2072
2288
  this.watcher.on('add', (path) => {
2289
+ this.handleGitignoreChange(path);
2290
+ if (this.isGitignored(path))
2291
+ return;
2073
2292
  this.logger.debug({ path }, 'File added');
2074
2293
  this.queue.enqueue({ type: 'create', path, priority: 'normal' }, () => this.wrapProcessing(() => this.processor.processFile(path)));
2075
2294
  });
2076
2295
  this.watcher.on('change', (path) => {
2296
+ this.handleGitignoreChange(path);
2297
+ if (this.isGitignored(path))
2298
+ return;
2077
2299
  this.logger.debug({ path }, 'File changed');
2078
2300
  this.queue.enqueue({ type: 'modify', path, priority: 'normal' }, () => this.wrapProcessing(() => this.processor.processFile(path)));
2079
2301
  });
2080
2302
  this.watcher.on('unlink', (path) => {
2303
+ this.handleGitignoreChange(path);
2304
+ if (this.isGitignored(path))
2305
+ return;
2081
2306
  this.logger.debug({ path }, 'File removed');
2082
2307
  this.queue.enqueue({ type: 'delete', path, priority: 'normal' }, () => this.wrapProcessing(() => this.processor.deleteFile(path)));
2083
2308
  });
@@ -2104,6 +2329,29 @@ class FileSystemWatcher {
2104
2329
  get systemHealth() {
2105
2330
  return this.health;
2106
2331
  }
2332
+ /**
2333
+ * Check if a path is gitignored and should be skipped.
2334
+ */
2335
+ isGitignored(path) {
2336
+ if (!this.gitignoreFilter)
2337
+ return false;
2338
+ const ignored = this.gitignoreFilter.isIgnored(path);
2339
+ if (ignored) {
2340
+ this.logger.debug({ path }, 'Skipping gitignored file');
2341
+ }
2342
+ return ignored;
2343
+ }
2344
+ /**
2345
+ * If the changed file is a `.gitignore`, invalidate the filter cache.
2346
+ */
2347
+ handleGitignoreChange(path) {
2348
+ if (!this.gitignoreFilter)
2349
+ return;
2350
+ if (path.endsWith('.gitignore')) {
2351
+ this.logger.info({ path }, 'Gitignore file changed, refreshing filter');
2352
+ this.gitignoreFilter.invalidate(path);
2353
+ }
2354
+ }
2107
2355
  /**
2108
2356
  * Wrap a processing operation with health tracking.
2109
2357
  * On success, resets the failure counter.
@@ -2259,10 +2507,15 @@ class JeevesWatcher {
2259
2507
  rateLimitPerMinute: this.config.embedding.rateLimitPerMinute,
2260
2508
  });
2261
2509
  this.queue = queue;
2510
+ const respectGitignore = this.config.watch.respectGitignore ?? true;
2511
+ const gitignoreFilter = respectGitignore
2512
+ ? new GitignoreFilter(this.config.watch.paths)
2513
+ : undefined;
2262
2514
  const watcher = this.factories.createFileSystemWatcher(this.config.watch, queue, processor, logger, {
2263
2515
  maxRetries: this.config.maxRetries,
2264
2516
  maxBackoffMs: this.config.maxBackoffMs,
2265
2517
  onFatalError: this.runtimeOptions.onFatalError,
2518
+ gitignoreFilter,
2266
2519
  });
2267
2520
  this.watcher = watcher;
2268
2521
  const server = this.factories.createApiServer({
@@ -2368,4 +2621,4 @@ async function startFromConfig(configPath) {
2368
2621
  return app;
2369
2622
  }
2370
2623
 
2371
- export { DocumentProcessor, EventQueue, FileSystemWatcher, JeevesWatcher, SystemHealth, VectorStoreClient, apiConfigSchema, applyRules, buildAttributes, compileRules, configWatchConfigSchema, contentHash, createApiServer, createEmbeddingProvider, createLogger, deleteMetadata, embeddingConfigSchema, extractText, inferenceRuleSchema, jeevesWatcherConfigSchema, loadConfig, loggingConfigSchema, metadataPath, pointId, readMetadata, startFromConfig, vectorStoreConfigSchema, watchConfigSchema, writeMetadata };
2624
+ export { DocumentProcessor, EventQueue, FileSystemWatcher, GitignoreFilter, JeevesWatcher, SystemHealth, VectorStoreClient, apiConfigSchema, applyRules, buildAttributes, compileRules, configWatchConfigSchema, contentHash, createApiServer, createEmbeddingProvider, createLogger, deleteMetadata, embeddingConfigSchema, extractText, inferenceRuleSchema, jeevesWatcherConfigSchema, loadConfig, loggingConfigSchema, metadataPath, pointId, readMetadata, startFromConfig, vectorStoreConfigSchema, watchConfigSchema, writeMetadata };
package/package.json CHANGED
@@ -26,6 +26,7 @@
26
26
  "commander": "^14.0.3",
27
27
  "cosmiconfig": "*",
28
28
  "fastify": "*",
29
+ "ignore": "^7.0.5",
29
30
  "js-yaml": "*",
30
31
  "json5": "*",
31
32
  "mammoth": "^1.11.0",
@@ -171,5 +172,5 @@
171
172
  },
172
173
  "type": "module",
173
174
  "types": "dist/index.d.ts",
174
- "version": "0.2.4"
175
+ "version": "0.2.6"
175
176
  }