raggrep 0.8.0 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli/main.js CHANGED
@@ -2642,44 +2642,10 @@ var init_queryIntent = __esm(() => {
2642
2642
  });
2643
2643
 
2644
2644
  // src/domain/services/chunking.ts
2645
- function createLineBasedChunks(content, options = {}) {
2646
- const {
2647
- chunkSize = DEFAULT_CHUNK_SIZE,
2648
- overlap = DEFAULT_OVERLAP,
2649
- minLinesForMultipleChunks = chunkSize
2650
- } = options;
2651
- const lines = content.split(`
2652
- `);
2653
- const chunks = [];
2654
- if (lines.length <= minLinesForMultipleChunks) {
2655
- return [
2656
- {
2657
- content,
2658
- startLine: 1,
2659
- endLine: lines.length,
2660
- type: "file"
2661
- }
2662
- ];
2663
- }
2664
- for (let i = 0;i < lines.length; i += chunkSize - overlap) {
2665
- const endIdx = Math.min(i + chunkSize, lines.length);
2666
- chunks.push({
2667
- content: lines.slice(i, endIdx).join(`
2668
- `),
2669
- startLine: i + 1,
2670
- endLine: endIdx,
2671
- type: "block"
2672
- });
2673
- if (endIdx >= lines.length)
2674
- break;
2675
- }
2676
- return chunks;
2677
- }
2678
2645
  function generateChunkId(filepath, startLine, endLine) {
2679
2646
  const safePath = filepath.replace(/[/\\]/g, "-").replace(/\./g, "_");
2680
2647
  return `${safePath}-${startLine}-${endLine}`;
2681
2648
  }
2682
- var DEFAULT_CHUNK_SIZE = 30, DEFAULT_OVERLAP = 5;
2683
2649
 
2684
2650
  // src/domain/services/queryLiteralParser.ts
2685
2651
  function parseQueryLiterals(query) {
@@ -3547,6 +3513,63 @@ var init_lexicon2 = __esm(() => {
3547
3513
  defaultLookupMap = buildLookupMap(DEFAULT_LEXICON);
3548
3514
  });
3549
3515
 
3516
+ // src/domain/services/jsonPathExtractor.ts
3517
+ function extractJsonPaths(obj, fileBasename) {
3518
+ const paths = extractPathsRecursive(obj, fileBasename);
3519
+ return paths.map((path8) => ({
3520
+ value: path8,
3521
+ type: "identifier",
3522
+ matchType: "definition"
3523
+ }));
3524
+ }
3525
+ function extractPathsRecursive(obj, prefix) {
3526
+ const paths = [];
3527
+ if (obj === null || obj === undefined) {
3528
+ return paths;
3529
+ }
3530
+ if (Array.isArray(obj)) {
3531
+ obj.forEach((item, index) => {
3532
+ const indexedPrefix = `${prefix}[${index}]`;
3533
+ paths.push(indexedPrefix);
3534
+ if (item !== null && typeof item === "object") {
3535
+ paths.push(...extractPathsRecursive(item, indexedPrefix));
3536
+ }
3537
+ });
3538
+ } else if (typeof obj === "object") {
3539
+ for (const [key, value] of Object.entries(obj)) {
3540
+ const fullPath = `${prefix}.${key}`;
3541
+ paths.push(fullPath);
3542
+ if (value !== null && typeof value === "object") {
3543
+ paths.push(...extractPathsRecursive(value, fullPath));
3544
+ }
3545
+ }
3546
+ }
3547
+ return paths;
3548
+ }
3549
+ function extractJsonKeywords(obj) {
3550
+ const keywords = new Set;
3551
+ const extract = (value, parentKey) => {
3552
+ if (value === null || value === undefined) {
3553
+ return;
3554
+ }
3555
+ if (typeof value === "string") {
3556
+ const words = value.replace(/([a-z])([A-Z])/g, "$1 $2").toLowerCase().split(/[\s_\-./]+/).filter((w) => w.length > 2);
3557
+ words.forEach((w) => keywords.add(w));
3558
+ } else if (Array.isArray(value)) {
3559
+ value.forEach((item) => extract(item));
3560
+ } else if (typeof value === "object") {
3561
+ for (const [key, val] of Object.entries(value)) {
3562
+ keywords.add(key.toLowerCase());
3563
+ const keyWords = key.replace(/([a-z])([A-Z])/g, "$1 $2").toLowerCase().split(/[\s_\-]+/).filter((w) => w.length > 2);
3564
+ keyWords.forEach((w) => keywords.add(w));
3565
+ extract(val, key);
3566
+ }
3567
+ }
3568
+ };
3569
+ extract(obj);
3570
+ return Array.from(keywords);
3571
+ }
3572
+
3550
3573
  // src/domain/services/index.ts
3551
3574
  var init_services = __esm(() => {
3552
3575
  init_keywords();
@@ -4477,113 +4500,66 @@ function isJsonFile(filepath) {
4477
4500
  const ext = path11.extname(filepath).toLowerCase();
4478
4501
  return JSON_EXTENSIONS.includes(ext);
4479
4502
  }
4480
- function extractJsonKeys(obj, prefix = "") {
4481
- const keys = [];
4482
- if (obj === null || obj === undefined) {
4483
- return keys;
4484
- }
4485
- if (Array.isArray(obj)) {
4486
- obj.forEach((item, index) => {
4487
- keys.push(...extractJsonKeys(item, `${prefix}[${index}]`));
4488
- });
4489
- } else if (typeof obj === "object") {
4490
- for (const [key, value] of Object.entries(obj)) {
4491
- const fullKey = prefix ? `${prefix}.${key}` : key;
4492
- keys.push(key);
4493
- keys.push(...extractJsonKeys(value, fullKey));
4494
- }
4495
- }
4496
- return keys;
4497
- }
4498
- function extractJsonKeywords(content) {
4499
- try {
4500
- const parsed = JSON.parse(content);
4501
- const keys = extractJsonKeys(parsed);
4502
- const stringValues = [];
4503
- const extractStrings = (obj) => {
4504
- if (typeof obj === "string") {
4505
- const words = obj.replace(/([a-z])([A-Z])/g, "$1 $2").toLowerCase().split(/\s+/).filter((w) => w.length > 2);
4506
- stringValues.push(...words);
4507
- } else if (Array.isArray(obj)) {
4508
- obj.forEach(extractStrings);
4509
- } else if (obj && typeof obj === "object") {
4510
- Object.values(obj).forEach(extractStrings);
4511
- }
4512
- };
4513
- extractStrings(parsed);
4514
- return [...new Set([...keys, ...stringValues])];
4515
- } catch {
4516
- return [];
4517
- }
4518
- }
4519
4503
 
4520
4504
  class JsonModule {
4521
4505
  id = "data/json";
4522
4506
  name = "JSON Search";
4523
- description = "JSON file search with structure-aware indexing";
4524
- version = "1.0.0";
4507
+ description = "JSON file search with literal-based key path indexing";
4508
+ version = "2.0.0";
4525
4509
  supportsFile(filepath) {
4526
4510
  return isJsonFile(filepath);
4527
4511
  }
4528
- embeddingConfig = null;
4529
4512
  symbolicIndex = null;
4513
+ literalIndex = null;
4530
4514
  pendingSummaries = new Map;
4515
+ pendingLiterals = new Map;
4531
4516
  rootDir = "";
4532
4517
  logger = undefined;
4533
4518
  async initialize(config) {
4534
- this.embeddingConfig = getEmbeddingConfigFromModule(config);
4535
4519
  this.logger = config.options?.logger;
4536
- if (this.logger) {
4537
- this.embeddingConfig = {
4538
- ...this.embeddingConfig,
4539
- logger: this.logger
4540
- };
4541
- }
4542
- configureEmbeddings(this.embeddingConfig);
4543
4520
  this.pendingSummaries.clear();
4521
+ this.pendingLiterals.clear();
4544
4522
  }
4545
4523
  async indexFile(filepath, content, ctx) {
4546
4524
  if (!isJsonFile(filepath)) {
4547
4525
  return null;
4548
4526
  }
4549
4527
  this.rootDir = ctx.rootDir;
4550
- const textChunks = createLineBasedChunks(content, {
4551
- chunkSize: 50,
4552
- overlap: 10
4553
- });
4554
- if (textChunks.length === 0) {
4528
+ let parsed;
4529
+ try {
4530
+ parsed = JSON.parse(content);
4531
+ } catch {
4555
4532
  return null;
4556
4533
  }
4557
- const chunkContents = textChunks.map((c) => {
4558
- const filename = path11.basename(filepath);
4559
- return `${filename}: ${c.content}`;
4560
- });
4561
- const embeddings = await getEmbeddings(chunkContents);
4562
- const chunks = textChunks.map((tc, i) => ({
4563
- id: generateChunkId(filepath, tc.startLine, tc.endLine),
4564
- content: tc.content,
4565
- startLine: tc.startLine,
4566
- endLine: tc.endLine,
4567
- type: tc.type
4568
- }));
4569
- const jsonKeys = extractJsonKeys((() => {
4570
- try {
4571
- return JSON.parse(content);
4572
- } catch {
4573
- return {};
4534
+ const fileBasename = path11.basename(filepath, path11.extname(filepath));
4535
+ const jsonPathLiterals = extractJsonPaths(parsed, fileBasename);
4536
+ const lines = content.split(`
4537
+ `);
4538
+ const lineCount = lines.length;
4539
+ const chunkId = generateChunkId(filepath, 1, lineCount);
4540
+ const chunks = [
4541
+ {
4542
+ id: chunkId,
4543
+ content,
4544
+ startLine: 1,
4545
+ endLine: lineCount,
4546
+ type: "file"
4574
4547
  }
4575
- })());
4548
+ ];
4549
+ if (jsonPathLiterals.length > 0) {
4550
+ this.pendingLiterals.set(chunkId, {
4551
+ filepath,
4552
+ literals: jsonPathLiterals
4553
+ });
4554
+ }
4576
4555
  const stats = await ctx.getFileStats(filepath);
4577
- const currentConfig = getEmbeddingConfig();
4578
4556
  const moduleData = {
4579
- embeddings,
4580
- embeddingModel: currentConfig.model,
4581
- jsonKeys
4557
+ jsonPaths: jsonPathLiterals.map((l) => l.value)
4582
4558
  };
4583
- const keywords = extractJsonKeywords(content);
4559
+ const keywords = extractJsonKeywords(parsed);
4584
4560
  const fileSummary = {
4585
4561
  filepath,
4586
- chunkCount: chunks.length,
4562
+ chunkCount: 1,
4587
4563
  chunkTypes: ["file"],
4588
4564
  keywords,
4589
4565
  exports: [],
@@ -4606,7 +4582,24 @@ class JsonModule {
4606
4582
  }
4607
4583
  this.symbolicIndex.buildBM25Index();
4608
4584
  await this.symbolicIndex.save();
4585
+ this.literalIndex = new LiteralIndex(indexDir, this.id);
4586
+ await this.literalIndex.initialize();
4587
+ const indexedFilepaths = new Set;
4588
+ for (const filepath of this.pendingSummaries.keys()) {
4589
+ indexedFilepaths.add(filepath);
4590
+ }
4591
+ for (const { filepath } of this.pendingLiterals.values()) {
4592
+ indexedFilepaths.add(filepath);
4593
+ }
4594
+ for (const filepath of indexedFilepaths) {
4595
+ this.literalIndex.removeFile(filepath);
4596
+ }
4597
+ for (const [chunkId, { filepath, literals }] of this.pendingLiterals) {
4598
+ this.literalIndex.addLiterals(chunkId, filepath, literals);
4599
+ }
4600
+ await this.literalIndex.save();
4609
4601
  this.pendingSummaries.clear();
4602
+ this.pendingLiterals.clear();
4610
4603
  }
4611
4604
  async search(query, ctx, options = {}) {
4612
4605
  const {
@@ -4614,8 +4607,15 @@ class JsonModule {
4614
4607
  minScore = DEFAULT_MIN_SCORE3,
4615
4608
  filePatterns
4616
4609
  } = options;
4610
+ const { literals: queryLiterals, remainingQuery } = parseQueryLiterals(query);
4617
4611
  const indexDir = getRaggrepDir(ctx.rootDir, ctx.config);
4618
4612
  const symbolicIndex = new SymbolicIndex(indexDir, this.id);
4613
+ const literalIndex = new LiteralIndex(indexDir, this.id);
4614
+ let literalMatchMap = new Map;
4615
+ try {
4616
+ await literalIndex.initialize();
4617
+ literalMatchMap = literalIndex.buildMatchMap(queryLiterals);
4618
+ } catch {}
4619
4619
  let allFiles;
4620
4620
  try {
4621
4621
  await symbolicIndex.initialize();
@@ -4635,25 +4635,16 @@ class JsonModule {
4635
4635
  });
4636
4636
  });
4637
4637
  }
4638
- const queryEmbedding = await getEmbedding(query);
4639
4638
  const bm25Index = new BM25Index;
4640
4639
  const allChunksData = [];
4641
4640
  for (const filepath of filesToSearch) {
4642
4641
  const fileIndex = await ctx.loadFileIndex(filepath);
4643
4642
  if (!fileIndex)
4644
4643
  continue;
4645
- const moduleData = fileIndex.moduleData;
4646
- if (!moduleData?.embeddings)
4647
- continue;
4648
- for (let i = 0;i < fileIndex.chunks.length; i++) {
4649
- const chunk = fileIndex.chunks[i];
4650
- const embedding = moduleData.embeddings[i];
4651
- if (!embedding)
4652
- continue;
4644
+ for (const chunk of fileIndex.chunks) {
4653
4645
  allChunksData.push({
4654
4646
  filepath: fileIndex.filepath,
4655
- chunk,
4656
- embedding
4647
+ chunk
4657
4648
  });
4658
4649
  bm25Index.addDocuments([{ id: chunk.id, content: chunk.content }]);
4659
4650
  }
@@ -4663,32 +4654,70 @@ class JsonModule {
4663
4654
  for (const result of bm25Results) {
4664
4655
  bm25Scores.set(result.id, normalizeScore(result.score, 3));
4665
4656
  }
4666
- const queryTerms = extractQueryTerms(query);
4667
4657
  const results = [];
4668
- for (const { filepath, chunk, embedding } of allChunksData) {
4669
- const semanticScore = cosineSimilarity(queryEmbedding, embedding);
4658
+ const processedChunkIds = new Set;
4659
+ for (const { filepath, chunk } of allChunksData) {
4670
4660
  const bm25Score = bm25Scores.get(chunk.id) || 0;
4671
- const hybridScore = SEMANTIC_WEIGHT2 * semanticScore + BM25_WEIGHT2 * bm25Score;
4672
- if (hybridScore >= minScore || bm25Score > 0.3) {
4661
+ const literalMatches = literalMatchMap.get(chunk.id) || [];
4662
+ const literalContribution = calculateLiteralContribution(literalMatches, bm25Score > 0);
4663
+ const baseScore = BM25_WEIGHT2 * bm25Score;
4664
+ const boostedScore = applyLiteralBoost(baseScore, literalMatches, bm25Score > 0);
4665
+ const literalBase = literalMatches.length > 0 && bm25Score === 0 ? LITERAL_SCORING_CONSTANTS.BASE_SCORE * LITERAL_WEIGHT : 0;
4666
+ const finalScore = boostedScore + literalBase;
4667
+ processedChunkIds.add(chunk.id);
4668
+ if (finalScore >= minScore || literalMatches.length > 0) {
4673
4669
  results.push({
4674
4670
  filepath,
4675
4671
  chunk,
4676
- score: hybridScore,
4672
+ score: finalScore,
4677
4673
  moduleId: this.id,
4678
4674
  context: {
4679
- semanticScore,
4680
- bm25Score
4675
+ bm25Score,
4676
+ literalMultiplier: literalContribution.multiplier,
4677
+ literalMatchType: literalContribution.bestMatchType,
4678
+ literalConfidence: literalContribution.bestConfidence,
4679
+ literalMatchCount: literalContribution.matchCount
4681
4680
  }
4682
4681
  });
4683
4682
  }
4684
4683
  }
4684
+ for (const [chunkId, matches] of literalMatchMap) {
4685
+ if (processedChunkIds.has(chunkId)) {
4686
+ continue;
4687
+ }
4688
+ const filepath = matches[0]?.filepath;
4689
+ if (!filepath)
4690
+ continue;
4691
+ const fileIndex = await ctx.loadFileIndex(filepath);
4692
+ if (!fileIndex)
4693
+ continue;
4694
+ const chunk = fileIndex.chunks.find((c) => c.id === chunkId);
4695
+ if (!chunk)
4696
+ continue;
4697
+ const literalContribution = calculateLiteralContribution(matches, false);
4698
+ const score = LITERAL_SCORING_CONSTANTS.BASE_SCORE * literalContribution.multiplier;
4699
+ processedChunkIds.add(chunkId);
4700
+ results.push({
4701
+ filepath,
4702
+ chunk,
4703
+ score,
4704
+ moduleId: this.id,
4705
+ context: {
4706
+ bm25Score: 0,
4707
+ literalMultiplier: literalContribution.multiplier,
4708
+ literalMatchType: literalContribution.bestMatchType,
4709
+ literalConfidence: literalContribution.bestConfidence,
4710
+ literalMatchCount: literalContribution.matchCount,
4711
+ literalOnly: true
4712
+ }
4713
+ });
4714
+ }
4685
4715
  results.sort((a, b) => b.score - a.score);
4686
4716
  return results.slice(0, topK);
4687
4717
  }
4688
4718
  }
4689
- var DEFAULT_MIN_SCORE3 = 0.15, DEFAULT_TOP_K3 = 10, SEMANTIC_WEIGHT2 = 0.7, BM25_WEIGHT2 = 0.3, JSON_EXTENSIONS, supportsFile2;
4719
+ var DEFAULT_MIN_SCORE3 = 0.1, DEFAULT_TOP_K3 = 10, BM25_WEIGHT2 = 0.4, LITERAL_WEIGHT = 0.6, JSON_EXTENSIONS, supportsFile2;
4690
4720
  var init_json = __esm(() => {
4691
- init_embeddings();
4692
4721
  init_services();
4693
4722
  init_config2();
4694
4723
  init_storage();
@@ -4958,7 +4987,7 @@ ${section.content}` : section.content,
4958
4987
  ].includes(t))) {
4959
4988
  docBoost = 0.05;
4960
4989
  }
4961
- const hybridScore = SEMANTIC_WEIGHT3 * semanticScore + BM25_WEIGHT3 * bm25Score + docBoost;
4990
+ const hybridScore = SEMANTIC_WEIGHT2 * semanticScore + BM25_WEIGHT3 * bm25Score + docBoost;
4962
4991
  if (hybridScore >= minScore || bm25Score > 0.3) {
4963
4992
  results.push({
4964
4993
  filepath,
@@ -4977,7 +5006,7 @@ ${section.content}` : section.content,
4977
5006
  return results.slice(0, topK);
4978
5007
  }
4979
5008
  }
4980
- var DEFAULT_MIN_SCORE4 = 0.15, DEFAULT_TOP_K4 = 10, SEMANTIC_WEIGHT3 = 0.7, BM25_WEIGHT3 = 0.3, MARKDOWN_EXTENSIONS, supportsFile3;
5009
+ var DEFAULT_MIN_SCORE4 = 0.15, DEFAULT_TOP_K4 = 10, SEMANTIC_WEIGHT2 = 0.7, BM25_WEIGHT3 = 0.3, MARKDOWN_EXTENSIONS, supportsFile3;
4981
5010
  var init_markdown = __esm(() => {
4982
5011
  init_embeddings();
4983
5012
  init_services();
@@ -6240,7 +6269,7 @@ init_logger();
6240
6269
  // package.json
6241
6270
  var package_default = {
6242
6271
  name: "raggrep",
6243
- version: "0.8.0",
6272
+ version: "0.8.1",
6244
6273
  description: "Local filesystem-based RAG system for codebases - semantic search using local embeddings",
6245
6274
  type: "module",
6246
6275
  main: "./dist/index.js",
@@ -6701,4 +6730,4 @@ Run 'raggrep <command> --help' for more information.
6701
6730
  }
6702
6731
  main();
6703
6732
 
6704
- //# debugId=400EC2685467A28B64756E2164756E21
6733
+ //# debugId=7B73D156971632D164756E2164756E21