raggrep 0.8.0 → 0.8.1

@@ -13,3 +13,4 @@ export { parseQueryLiterals } from "./queryLiteralParser";
  export { extractLiterals, extractLiteralsWithReferences, } from "./literalExtractor";
  export { calculateLiteralMultiplier, calculateMaxMultiplier, calculateLiteralContribution, applyLiteralBoost, mergeWithLiteralBoost, LITERAL_SCORING_CONSTANTS, type LiteralScoreContribution, type MergeInput, type MergeOutput, } from "./literalScorer";
  export { getSynonyms, expandQuery, DEFAULT_LEXICON, EXPANSION_WEIGHTS, DEFAULT_EXPANSION_OPTIONS, } from "./lexicon";
+ export { extractJsonPaths, extractJsonKeywords } from "./jsonPathExtractor";
@@ -0,0 +1,29 @@
+ /**
+ * JSON Path Extractor
+ *
+ * Extracts dot-notation key paths from JSON objects as literals.
+ * Used for literal-based indexing of JSON files.
+ *
+ * @example
+ * // user.json: { name: { first: "john" } }
+ * extractJsonPaths({ name: { first: "john" } }, "user")
+ * // Returns literals for: "user.name", "user.name.first"
+ */
+ import type { ExtractedLiteral } from "../entities/literal";
+ /**
+ * Extract all key paths from a JSON object as literals.
+ * Prefixes all paths with the filename (without extension).
+ *
+ * @param obj - Parsed JSON object
+ * @param fileBasename - Filename without extension (e.g., "user" from "user.json")
+ * @returns Array of literals representing all dot-notation paths
+ */
+ export declare function extractJsonPaths(obj: unknown, fileBasename: string): ExtractedLiteral[];
+ /**
+ * Extract keywords from JSON for BM25 indexing.
+ * Extracts both keys and string values.
+ *
+ * @param obj - Parsed JSON object
+ * @returns Array of keywords for BM25 indexing
+ */
+ export declare function extractJsonKeywords(obj: unknown): string[];
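
Taken together, the two declarations split JSON indexing into exact-match literals (key paths) and BM25 keywords (keys plus string values). A sketch of the expected behavior, inferred from the doc comments here and the bundled implementation further down in this diff; the sample object is illustrative:

    // user.json
    const parsed = { name: { first: "john" }, tags: ["admin"] };

    extractJsonPaths(parsed, "user").map((l) => l.value);
    // => ["user.name", "user.name.first", "user.tags", "user.tags[0]"]

    extractJsonKeywords(parsed);
    // => ["name", "first", "john", "tags", "admin"]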
@@ -0,0 +1,4 @@
+ /**
+ * JSON Path Extractor Tests
+ */
+ export {};
package/dist/index.js CHANGED
@@ -2548,44 +2548,10 @@ var init_queryIntent = __esm(() => {
  });

  // src/domain/services/chunking.ts
- function createLineBasedChunks(content, options = {}) {
- const {
- chunkSize = DEFAULT_CHUNK_SIZE,
- overlap = DEFAULT_OVERLAP,
- minLinesForMultipleChunks = chunkSize
- } = options;
- const lines = content.split(`
- `);
- const chunks = [];
- if (lines.length <= minLinesForMultipleChunks) {
- return [
- {
- content,
- startLine: 1,
- endLine: lines.length,
- type: "file"
- }
- ];
- }
- for (let i = 0;i < lines.length; i += chunkSize - overlap) {
- const endIdx = Math.min(i + chunkSize, lines.length);
- chunks.push({
- content: lines.slice(i, endIdx).join(`
- `),
- startLine: i + 1,
- endLine: endIdx,
- type: "block"
- });
- if (endIdx >= lines.length)
- break;
- }
- return chunks;
- }
  function generateChunkId(filepath, startLine, endLine) {
  const safePath = filepath.replace(/[/\\]/g, "-").replace(/\./g, "_");
  return `${safePath}-${startLine}-${endLine}`;
  }
- var DEFAULT_CHUNK_SIZE = 30, DEFAULT_OVERLAP = 5;

  // src/domain/services/queryLiteralParser.ts
  function parseQueryLiterals(query) {
@@ -3453,6 +3419,63 @@ var init_lexicon2 = __esm(() => {
  defaultLookupMap = buildLookupMap(DEFAULT_LEXICON);
  });

+ // src/domain/services/jsonPathExtractor.ts
+ function extractJsonPaths(obj, fileBasename) {
+ const paths = extractPathsRecursive(obj, fileBasename);
+ return paths.map((path8) => ({
+ value: path8,
+ type: "identifier",
+ matchType: "definition"
+ }));
+ }
+ function extractPathsRecursive(obj, prefix) {
+ const paths = [];
+ if (obj === null || obj === undefined) {
+ return paths;
+ }
+ if (Array.isArray(obj)) {
+ obj.forEach((item, index) => {
+ const indexedPrefix = `${prefix}[${index}]`;
+ paths.push(indexedPrefix);
+ if (item !== null && typeof item === "object") {
+ paths.push(...extractPathsRecursive(item, indexedPrefix));
+ }
+ });
+ } else if (typeof obj === "object") {
+ for (const [key, value] of Object.entries(obj)) {
+ const fullPath = `${prefix}.${key}`;
+ paths.push(fullPath);
+ if (value !== null && typeof value === "object") {
+ paths.push(...extractPathsRecursive(value, fullPath));
+ }
+ }
+ }
+ return paths;
+ }
+ function extractJsonKeywords(obj) {
+ const keywords = new Set;
+ const extract = (value, parentKey) => {
+ if (value === null || value === undefined) {
+ return;
+ }
+ if (typeof value === "string") {
+ const words = value.replace(/([a-z])([A-Z])/g, "$1 $2").toLowerCase().split(/[\s_\-./]+/).filter((w) => w.length > 2);
+ words.forEach((w) => keywords.add(w));
+ } else if (Array.isArray(value)) {
+ value.forEach((item) => extract(item));
+ } else if (typeof value === "object") {
+ for (const [key, val] of Object.entries(value)) {
+ keywords.add(key.toLowerCase());
+ const keyWords = key.replace(/([a-z])([A-Z])/g, "$1 $2").toLowerCase().split(/[\s_\-]+/).filter((w) => w.length > 2);
+ keyWords.forEach((w) => keywords.add(w));
+ extract(val, key);
+ }
+ }
+ };
+ extract(obj);
+ return Array.from(keywords);
+ }
+

  // src/domain/services/index.ts
  var init_services = __esm(() => {
  init_keywords();
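
Two behaviors in the bundled implementation are worth calling out: array elements get bracketed index paths rather than dotted ones, and both camelCase keys and string values are split into lowercase keyword fragments (on case boundaries, whitespace, `_`, `-`, and for values also `.` and `/`). A hedged trace with a made-up object:

    // Hypothetical input, traced against the functions above.
    const pkg = { scripts: { buildAll: "tsc --build" }, files: ["dist"] };

    extractJsonPaths(pkg, "package").map((l) => l.value);
    // => ["package.scripts", "package.scripts.buildAll",
    //     "package.files", "package.files[0]"]

    extractJsonKeywords(pkg);
    // => ["scripts", "buildall", "build", "all", "tsc", "files", "dist"]
    //    ("buildAll" yields both the raw lowercased key and its fragments)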
@@ -4383,113 +4406,66 @@ function isJsonFile(filepath) {
  const ext = path11.extname(filepath).toLowerCase();
  return JSON_EXTENSIONS.includes(ext);
  }
- function extractJsonKeys(obj, prefix = "") {
- const keys = [];
- if (obj === null || obj === undefined) {
- return keys;
- }
- if (Array.isArray(obj)) {
- obj.forEach((item, index) => {
- keys.push(...extractJsonKeys(item, `${prefix}[${index}]`));
- });
- } else if (typeof obj === "object") {
- for (const [key, value] of Object.entries(obj)) {
- const fullKey = prefix ? `${prefix}.${key}` : key;
- keys.push(key);
- keys.push(...extractJsonKeys(value, fullKey));
- }
- }
- return keys;
- }
- function extractJsonKeywords(content) {
- try {
- const parsed = JSON.parse(content);
- const keys = extractJsonKeys(parsed);
- const stringValues = [];
- const extractStrings = (obj) => {
- if (typeof obj === "string") {
- const words = obj.replace(/([a-z])([A-Z])/g, "$1 $2").toLowerCase().split(/\s+/).filter((w) => w.length > 2);
- stringValues.push(...words);
- } else if (Array.isArray(obj)) {
- obj.forEach(extractStrings);
- } else if (obj && typeof obj === "object") {
- Object.values(obj).forEach(extractStrings);
- }
- };
- extractStrings(parsed);
- return [...new Set([...keys, ...stringValues])];
- } catch {
- return [];
- }
- }

  class JsonModule {
  id = "data/json";
  name = "JSON Search";
- description = "JSON file search with structure-aware indexing";
- version = "1.0.0";
+ description = "JSON file search with literal-based key path indexing";
+ version = "2.0.0";
  supportsFile(filepath) {
  return isJsonFile(filepath);
  }
- embeddingConfig = null;
  symbolicIndex = null;
+ literalIndex = null;
  pendingSummaries = new Map;
+ pendingLiterals = new Map;
  rootDir = "";
  logger = undefined;
  async initialize(config) {
- this.embeddingConfig = getEmbeddingConfigFromModule(config);
  this.logger = config.options?.logger;
- if (this.logger) {
- this.embeddingConfig = {
- ...this.embeddingConfig,
- logger: this.logger
- };
- }
- configureEmbeddings(this.embeddingConfig);
  this.pendingSummaries.clear();
+ this.pendingLiterals.clear();
  }
  async indexFile(filepath, content, ctx) {
  if (!isJsonFile(filepath)) {
  return null;
  }
  this.rootDir = ctx.rootDir;
- const textChunks = createLineBasedChunks(content, {
- chunkSize: 50,
- overlap: 10
- });
- if (textChunks.length === 0) {
+ let parsed;
+ try {
+ parsed = JSON.parse(content);
+ } catch {
  return null;
  }
- const chunkContents = textChunks.map((c) => {
- const filename = path11.basename(filepath);
- return `${filename}: ${c.content}`;
- });
- const embeddings = await getEmbeddings(chunkContents);
- const chunks = textChunks.map((tc, i) => ({
- id: generateChunkId(filepath, tc.startLine, tc.endLine),
- content: tc.content,
- startLine: tc.startLine,
- endLine: tc.endLine,
- type: tc.type
- }));
- const jsonKeys = extractJsonKeys((() => {
- try {
- return JSON.parse(content);
- } catch {
- return {};
+ const fileBasename = path11.basename(filepath, path11.extname(filepath));
+ const jsonPathLiterals = extractJsonPaths(parsed, fileBasename);
+ const lines = content.split(`
+ `);
+ const lineCount = lines.length;
+ const chunkId = generateChunkId(filepath, 1, lineCount);
+ const chunks = [
+ {
+ id: chunkId,
+ content,
+ startLine: 1,
+ endLine: lineCount,
+ type: "file"
  }
- })());
+ ];
+ if (jsonPathLiterals.length > 0) {
+ this.pendingLiterals.set(chunkId, {
+ filepath,
+ literals: jsonPathLiterals
+ });
+ }
  const stats = await ctx.getFileStats(filepath);
- const currentConfig = getEmbeddingConfig();
  const moduleData = {
- embeddings,
- embeddingModel: currentConfig.model,
- jsonKeys
+ jsonPaths: jsonPathLiterals.map((l) => l.value)
  };
- const keywords = extractJsonKeywords(content);
+ const keywords = extractJsonKeywords(parsed);
  const fileSummary = {
  filepath,
- chunkCount: chunks.length,
+ chunkCount: 1,
  chunkTypes: ["file"],
  keywords,
  exports: [],
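
The indexing path is now embedding-free: every JSON file becomes a single whole-file chunk, the extracted paths land in moduleData.jsonPaths, and the same literals are queued in pendingLiterals keyed by chunk id until save() flushes them. A sketch of the resulting shapes for a one-line user.json, derived from generateChunkId and the code above (illustrative, not actual output):

    // user.json: {"name":{"first":"john"}}   (1 line)
    // chunks:          [{ id: "user_json-1-1", content, startLine: 1, endLine: 1, type: "file" }]
    // moduleData:      { jsonPaths: ["user.name", "user.name.first"] }
    // pendingLiterals: "user_json-1-1" -> { filepath: "user.json", literals: [/* ExtractedLiteral[] */] }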
@@ -4512,7 +4488,24 @@ class JsonModule {
  }
  this.symbolicIndex.buildBM25Index();
  await this.symbolicIndex.save();
+ this.literalIndex = new LiteralIndex(indexDir, this.id);
+ await this.literalIndex.initialize();
+ const indexedFilepaths = new Set;
+ for (const filepath of this.pendingSummaries.keys()) {
+ indexedFilepaths.add(filepath);
+ }
+ for (const { filepath } of this.pendingLiterals.values()) {
+ indexedFilepaths.add(filepath);
+ }
+ for (const filepath of indexedFilepaths) {
+ this.literalIndex.removeFile(filepath);
+ }
+ for (const [chunkId, { filepath, literals }] of this.pendingLiterals) {
+ this.literalIndex.addLiterals(chunkId, filepath, literals);
+ }
+ await this.literalIndex.save();
  this.pendingSummaries.clear();
+ this.pendingLiterals.clear();
  }
  async search(query, ctx, options = {}) {
  const {
@@ -4520,8 +4513,15 @@ class JsonModule {
  minScore = DEFAULT_MIN_SCORE3,
  filePatterns
  } = options;
+ const { literals: queryLiterals, remainingQuery } = parseQueryLiterals(query);
  const indexDir = getRaggrepDir(ctx.rootDir, ctx.config);
  const symbolicIndex = new SymbolicIndex(indexDir, this.id);
+ const literalIndex = new LiteralIndex(indexDir, this.id);
+ let literalMatchMap = new Map;
+ try {
+ await literalIndex.initialize();
+ literalMatchMap = literalIndex.buildMatchMap(queryLiterals);
+ } catch {}
  let allFiles;
  try {
  await symbolicIndex.initialize();
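
search() now has a literal side-channel: the query is split into literal tokens and remaining free text before any index is touched, and buildMatchMap resolves those tokens against the saved literal index up front. A rough sketch of the intended flow; the exact tokenization rules of parseQueryLiterals are not part of this diff, so the split shown is an assumption:

    // Assumption: dotted/quoted tokens are treated as literals.
    const query = "user.name.first login";
    const { literals: queryLiterals, remainingQuery } = parseQueryLiterals(query);
    // queryLiterals  ~ [{ value: "user.name.first", ... }]   (assumed shape)
    // remainingQuery ~ "login"

    // literalMatchMap then maps chunkId -> matches for indexed paths that
    // correspond to a query literal; a missing index falls through the catch.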
@@ -4541,25 +4541,16 @@ class JsonModule {
  });
  });
  }
- const queryEmbedding = await getEmbedding(query);
  const bm25Index = new BM25Index;
  const allChunksData = [];
  for (const filepath of filesToSearch) {
  const fileIndex = await ctx.loadFileIndex(filepath);
  if (!fileIndex)
  continue;
- const moduleData = fileIndex.moduleData;
- if (!moduleData?.embeddings)
- continue;
- for (let i = 0;i < fileIndex.chunks.length; i++) {
- const chunk = fileIndex.chunks[i];
- const embedding = moduleData.embeddings[i];
- if (!embedding)
- continue;
+ for (const chunk of fileIndex.chunks) {
  allChunksData.push({
  filepath: fileIndex.filepath,
- chunk,
- embedding
+ chunk
  });
  bm25Index.addDocuments([{ id: chunk.id, content: chunk.content }]);
  }
@@ -4569,32 +4560,70 @@ class JsonModule {
  for (const result of bm25Results) {
  bm25Scores.set(result.id, normalizeScore(result.score, 3));
  }
- const queryTerms = extractQueryTerms(query);
  const results = [];
- for (const { filepath, chunk, embedding } of allChunksData) {
- const semanticScore = cosineSimilarity(queryEmbedding, embedding);
+ const processedChunkIds = new Set;
+ for (const { filepath, chunk } of allChunksData) {
  const bm25Score = bm25Scores.get(chunk.id) || 0;
- const hybridScore = SEMANTIC_WEIGHT2 * semanticScore + BM25_WEIGHT2 * bm25Score;
- if (hybridScore >= minScore || bm25Score > 0.3) {
+ const literalMatches = literalMatchMap.get(chunk.id) || [];
+ const literalContribution = calculateLiteralContribution(literalMatches, bm25Score > 0);
+ const baseScore = BM25_WEIGHT2 * bm25Score;
+ const boostedScore = applyLiteralBoost(baseScore, literalMatches, bm25Score > 0);
+ const literalBase = literalMatches.length > 0 && bm25Score === 0 ? LITERAL_SCORING_CONSTANTS.BASE_SCORE * LITERAL_WEIGHT : 0;
+ const finalScore = boostedScore + literalBase;
+ processedChunkIds.add(chunk.id);
+ if (finalScore >= minScore || literalMatches.length > 0) {
  results.push({
  filepath,
  chunk,
- score: hybridScore,
+ score: finalScore,
  moduleId: this.id,
  context: {
- semanticScore,
- bm25Score
+ bm25Score,
+ literalMultiplier: literalContribution.multiplier,
+ literalMatchType: literalContribution.bestMatchType,
+ literalConfidence: literalContribution.bestConfidence,
+ literalMatchCount: literalContribution.matchCount
  }
  });
  }
  }
+ for (const [chunkId, matches] of literalMatchMap) {
+ if (processedChunkIds.has(chunkId)) {
+ continue;
+ }
+ const filepath = matches[0]?.filepath;
+ if (!filepath)
+ continue;
+ const fileIndex = await ctx.loadFileIndex(filepath);
+ if (!fileIndex)
+ continue;
+ const chunk = fileIndex.chunks.find((c) => c.id === chunkId);
+ if (!chunk)
+ continue;
+ const literalContribution = calculateLiteralContribution(matches, false);
+ const score = LITERAL_SCORING_CONSTANTS.BASE_SCORE * literalContribution.multiplier;
+ processedChunkIds.add(chunkId);
+ results.push({
+ filepath,
+ chunk,
+ score,
+ moduleId: this.id,
+ context: {
+ bm25Score: 0,
+ literalMultiplier: literalContribution.multiplier,
+ literalMatchType: literalContribution.bestMatchType,
+ literalConfidence: literalContribution.bestConfidence,
+ literalMatchCount: literalContribution.matchCount,
+ literalOnly: true
+ }
+ });
+ }
  results.sort((a, b) => b.score - a.score);
  return results.slice(0, topK);
  }
  }
- var DEFAULT_MIN_SCORE3 = 0.15, DEFAULT_TOP_K3 = 10, SEMANTIC_WEIGHT2 = 0.7, BM25_WEIGHT2 = 0.3, JSON_EXTENSIONS, supportsFile2;
+ var DEFAULT_MIN_SCORE3 = 0.1, DEFAULT_TOP_K3 = 10, BM25_WEIGHT2 = 0.4, LITERAL_WEIGHT = 0.6, JSON_EXTENSIONS, supportsFile2;
  var init_json = __esm(() => {
- init_embeddings();
  init_services();
  init_config2();
  init_storage();
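
Scoring for JSON chunks is now BM25-plus-literals instead of the 0.8.0 semantic/BM25 hybrid. A worked sketch of the three branches above, assuming LITERAL_SCORING_CONSTANTS.BASE_SCORE = 0.5 and a 1.5x multiplier from the literal scorer (both values are assumptions; they live in literalScorer, outside this hunk):

    // Constants from this hunk:
    // BM25_WEIGHT2 = 0.4, LITERAL_WEIGHT = 0.6, DEFAULT_MIN_SCORE3 = 0.1

    // 1) BM25 hit (0.8) plus a literal match:
    //    baseScore    = 0.4 * 0.8  = 0.32
    //    boostedScore = 0.32 * 1.5 = 0.48   (assumed applyLiteralBoost behavior)
    //    literalBase  = 0                   (bm25Score > 0)
    //    finalScore   = 0.48

    // 2) Literal match only, chunk already in the BM25 pass (bm25Score = 0):
    //    boostedScore = 0
    //    literalBase  = 0.5 * 0.6 = 0.30
    //    finalScore   = 0.30, kept regardless since literalMatches.length > 0

    // 3) Chunk surfaced only by the literal index (second pass):
    //    score = BASE_SCORE * multiplier = 0.5 * 1.5 = 0.75   (assumed values)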
@@ -4864,7 +4893,7 @@ ${section.content}` : section.content,
  ].includes(t))) {
  docBoost = 0.05;
  }
- const hybridScore = SEMANTIC_WEIGHT3 * semanticScore + BM25_WEIGHT3 * bm25Score + docBoost;
+ const hybridScore = SEMANTIC_WEIGHT2 * semanticScore + BM25_WEIGHT3 * bm25Score + docBoost;
  if (hybridScore >= minScore || bm25Score > 0.3) {
  results.push({
  filepath,
@@ -4883,7 +4912,7 @@ ${section.content}` : section.content,
  return results.slice(0, topK);
  }
  }
- var DEFAULT_MIN_SCORE4 = 0.15, DEFAULT_TOP_K4 = 10, SEMANTIC_WEIGHT3 = 0.7, BM25_WEIGHT3 = 0.3, MARKDOWN_EXTENSIONS, supportsFile3;
+ var DEFAULT_MIN_SCORE4 = 0.15, DEFAULT_TOP_K4 = 10, SEMANTIC_WEIGHT2 = 0.7, BM25_WEIGHT3 = 0.3, MARKDOWN_EXTENSIONS, supportsFile3;
  var init_markdown = __esm(() => {
  init_embeddings();
  init_services();
@@ -6058,4 +6087,4 @@ export {
  ConsoleLogger
  };

- //# debugId=59B4DA12592C31BA64756E2164756E21
+ //# debugId=7A45B6717CB7C82E64756E2164756E21