raggrep 0.8.0 → 0.8.1

@@ -13,3 +13,4 @@ export { parseQueryLiterals } from "./queryLiteralParser";
  export { extractLiterals, extractLiteralsWithReferences, } from "./literalExtractor";
  export { calculateLiteralMultiplier, calculateMaxMultiplier, calculateLiteralContribution, applyLiteralBoost, mergeWithLiteralBoost, LITERAL_SCORING_CONSTANTS, type LiteralScoreContribution, type MergeInput, type MergeOutput, } from "./literalScorer";
  export { getSynonyms, expandQuery, DEFAULT_LEXICON, EXPANSION_WEIGHTS, DEFAULT_EXPANSION_OPTIONS, } from "./lexicon";
+ export { extractJsonPaths, extractJsonKeywords } from "./jsonPathExtractor";
@@ -0,0 +1,29 @@
+ /**
+ * JSON Path Extractor
+ *
+ * Extracts dot-notation key paths from JSON objects as literals.
+ * Used for literal-based indexing of JSON files.
+ *
+ * @example
+ * // user.json: { name: { first: "john" } }
+ * extractJsonPaths({ name: { first: "john" } }, "user")
+ * // Returns literals for: "user.name", "user.name.first"
+ */
+ import type { ExtractedLiteral } from "../entities/literal";
+ /**
+ * Extract all key paths from a JSON object as literals.
+ * Prefixes all paths with the filename (without extension).
+ *
+ * @param obj - Parsed JSON object
+ * @param fileBasename - Filename without extension (e.g., "user" from "user.json")
+ * @returns Array of literals representing all dot-notation paths
+ */
+ export declare function extractJsonPaths(obj: unknown, fileBasename: string): ExtractedLiteral[];
+ /**
+ * Extract keywords from JSON for BM25 indexing.
+ * Extracts both keys and string values.
+ *
+ * @param obj - Parsed JSON object
+ * @returns Array of keywords for BM25 indexing
+ */
+ export declare function extractJsonKeywords(obj: unknown): string[];
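
Taken together, the two declarations split JSON indexing into exact-match literals (key paths) and BM25 keywords (keys plus string values). A sketch of the expected behavior, inferred from the doc comments here and the bundled implementation further down in this diff; the sample object is illustrative:

    // user.json
    const parsed = { name: { first: "john" }, tags: ["admin"] };

    extractJsonPaths(parsed, "user").map((l) => l.value);
    // => ["user.name", "user.name.first", "user.tags", "user.tags[0]"]

    extractJsonKeywords(parsed);
    // => ["name", "first", "john", "tags", "admin"]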
@@ -0,0 +1,4 @@
+ /**
+ * JSON Path Extractor Tests
+ */
+ export {};
package/dist/index.js CHANGED
@@ -2548,44 +2548,10 @@ var init_queryIntent = __esm(() => {
  });

  // src/domain/services/chunking.ts
- function createLineBasedChunks(content, options = {}) {
- const {
- chunkSize = DEFAULT_CHUNK_SIZE,
- overlap = DEFAULT_OVERLAP,
- minLinesForMultipleChunks = chunkSize
- } = options;
- const lines = content.split(`
- `);
- const chunks = [];
- if (lines.length <= minLinesForMultipleChunks) {
- return [
- {
- content,
- startLine: 1,
- endLine: lines.length,
- type: "file"
- }
- ];
- }
- for (let i = 0;i < lines.length; i += chunkSize - overlap) {
- const endIdx = Math.min(i + chunkSize, lines.length);
- chunks.push({
- content: lines.slice(i, endIdx).join(`
- `),
- startLine: i + 1,
- endLine: endIdx,
- type: "block"
- });
- if (endIdx >= lines.length)
- break;
- }
- return chunks;
- }
  function generateChunkId(filepath, startLine, endLine) {
  const safePath = filepath.replace(/[/\\]/g, "-").replace(/\./g, "_");
  return `${safePath}-${startLine}-${endLine}`;
  }
- var DEFAULT_CHUNK_SIZE = 30, DEFAULT_OVERLAP = 5;

  // src/domain/services/queryLiteralParser.ts
  function parseQueryLiterals(query) {
@@ -3453,6 +3419,63 @@ var init_lexicon2 = __esm(() => {
  defaultLookupMap = buildLookupMap(DEFAULT_LEXICON);
  });

+ // src/domain/services/jsonPathExtractor.ts
+ function extractJsonPaths(obj, fileBasename) {
+ const paths = extractPathsRecursive(obj, fileBasename);
+ return paths.map((path8) => ({
+ value: path8,
+ type: "identifier",
+ matchType: "definition"
+ }));
+ }
+ function extractPathsRecursive(obj, prefix) {
+ const paths = [];
+ if (obj === null || obj === undefined) {
+ return paths;
+ }
+ if (Array.isArray(obj)) {
+ obj.forEach((item, index) => {
+ const indexedPrefix = `${prefix}[${index}]`;
+ paths.push(indexedPrefix);
+ if (item !== null && typeof item === "object") {
+ paths.push(...extractPathsRecursive(item, indexedPrefix));
+ }
+ });
+ } else if (typeof obj === "object") {
+ for (const [key, value] of Object.entries(obj)) {
+ const fullPath = `${prefix}.${key}`;
+ paths.push(fullPath);
+ if (value !== null && typeof value === "object") {
+ paths.push(...extractPathsRecursive(value, fullPath));
+ }
+ }
+ }
+ return paths;
+ }
+ function extractJsonKeywords(obj) {
+ const keywords = new Set;
+ const extract = (value, parentKey) => {
+ if (value === null || value === undefined) {
+ return;
+ }
+ if (typeof value === "string") {
+ const words = value.replace(/([a-z])([A-Z])/g, "$1 $2").toLowerCase().split(/[\s_\-./]+/).filter((w) => w.length > 2);
+ words.forEach((w) => keywords.add(w));
+ } else if (Array.isArray(value)) {
+ value.forEach((item) => extract(item));
+ } else if (typeof value === "object") {
+ for (const [key, val] of Object.entries(value)) {
+ keywords.add(key.toLowerCase());
+ const keyWords = key.replace(/([a-z])([A-Z])/g, "$1 $2").toLowerCase().split(/[\s_\-]+/).filter((w) => w.length > 2);
+ keyWords.forEach((w) => keywords.add(w));
+ extract(val, key);
+ }
+ }
+ };
+ extract(obj);
+ return Array.from(keywords);
+ }
+

  // src/domain/services/index.ts
  var init_services = __esm(() => {
  init_keywords();
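
Two behaviors in the bundled implementation are worth calling out: array elements get bracketed index paths rather than dotted ones, and both camelCase keys and string values are split into lowercase keyword fragments (on case boundaries, whitespace, `_`, `-`, and for values also `.` and `/`). A hedged trace with a made-up object:

    // Hypothetical input, traced against the functions above.
    const pkg = { scripts: { buildAll: "tsc --build" }, files: ["dist"] };

    extractJsonPaths(pkg, "package").map((l) => l.value);
    // => ["package.scripts", "package.scripts.buildAll",
    //     "package.files", "package.files[0]"]

    extractJsonKeywords(pkg);
    // => ["scripts", "buildall", "build", "all", "tsc", "files", "dist"]
    //    ("buildAll" yields both the raw lowercased key and its fragments)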
@@ -4383,113 +4406,66 @@ function isJsonFile(filepath) {
  const ext = path11.extname(filepath).toLowerCase();
  return JSON_EXTENSIONS.includes(ext);
  }
- function extractJsonKeys(obj, prefix = "") {
- const keys = [];
- if (obj === null || obj === undefined) {
- return keys;
- }
- if (Array.isArray(obj)) {
- obj.forEach((item, index) => {
- keys.push(...extractJsonKeys(item, `${prefix}[${index}]`));
- });
- } else if (typeof obj === "object") {
- for (const [key, value] of Object.entries(obj)) {
- const fullKey = prefix ? `${prefix}.${key}` : key;
- keys.push(key);
- keys.push(...extractJsonKeys(value, fullKey));
- }
- }
- return keys;
- }
- function extractJsonKeywords(content) {
- try {
- const parsed = JSON.parse(content);
- const keys = extractJsonKeys(parsed);
- const stringValues = [];
- const extractStrings = (obj) => {
- if (typeof obj === "string") {
- const words = obj.replace(/([a-z])([A-Z])/g, "$1 $2").toLowerCase().split(/\s+/).filter((w) => w.length > 2);
- stringValues.push(...words);
- } else if (Array.isArray(obj)) {
- obj.forEach(extractStrings);
- } else if (obj && typeof obj === "object") {
- Object.values(obj).forEach(extractStrings);
- }
- };
- extractStrings(parsed);
- return [...new Set([...keys, ...stringValues])];
- } catch {
- return [];
- }
- }

  class JsonModule {
  id = "data/json";
  name = "JSON Search";
- description = "JSON file search with structure-aware indexing";
- version = "1.0.0";
+ description = "JSON file search with literal-based key path indexing";
+ version = "2.0.0";
  supportsFile(filepath) {
  return isJsonFile(filepath);
  }
- embeddingConfig = null;
  symbolicIndex = null;
+ literalIndex = null;
  pendingSummaries = new Map;
+ pendingLiterals = new Map;
  rootDir = "";
  logger = undefined;
  async initialize(config) {
- this.embeddingConfig = getEmbeddingConfigFromModule(config);
  this.logger = config.options?.logger;
- if (this.logger) {
- this.embeddingConfig = {
- ...this.embeddingConfig,
- logger: this.logger
- };
- }
- configureEmbeddings(this.embeddingConfig);
  this.pendingSummaries.clear();
+ this.pendingLiterals.clear();
  }
  async indexFile(filepath, content, ctx) {
  if (!isJsonFile(filepath)) {
  return null;
  }
  this.rootDir = ctx.rootDir;
- const textChunks = createLineBasedChunks(content, {
- chunkSize: 50,
- overlap: 10
- });
- if (textChunks.length === 0) {
+ let parsed;
+ try {
+ parsed = JSON.parse(content);
+ } catch {
  return null;
  }
- const chunkContents = textChunks.map((c) => {
- const filename = path11.basename(filepath);
- return `${filename}: ${c.content}`;
- });
- const embeddings = await getEmbeddings(chunkContents);
- const chunks = textChunks.map((tc, i) => ({
- id: generateChunkId(filepath, tc.startLine, tc.endLine),
- content: tc.content,
- startLine: tc.startLine,
- endLine: tc.endLine,
- type: tc.type
- }));
- const jsonKeys = extractJsonKeys((() => {
- try {
- return JSON.parse(content);
- } catch {
- return {};
+ const fileBasename = path11.basename(filepath, path11.extname(filepath));
+ const jsonPathLiterals = extractJsonPaths(parsed, fileBasename);
+ const lines = content.split(`
+ `);
+ const lineCount = lines.length;
+ const chunkId = generateChunkId(filepath, 1, lineCount);
+ const chunks = [
+ {
+ id: chunkId,
+ content,
+ startLine: 1,
+ endLine: lineCount,
+ type: "file"
  }
- })());
+ ];
+ if (jsonPathLiterals.length > 0) {
+ this.pendingLiterals.set(chunkId, {
+ filepath,
+ literals: jsonPathLiterals
+ });
+ }
  const stats = await ctx.getFileStats(filepath);
- const currentConfig = getEmbeddingConfig();
  const moduleData = {
- embeddings,
- embeddingModel: currentConfig.model,
- jsonKeys
+ jsonPaths: jsonPathLiterals.map((l) => l.value)
  };
- const keywords = extractJsonKeywords(content);
+ const keywords = extractJsonKeywords(parsed);
  const fileSummary = {
  filepath,
- chunkCount: chunks.length,
+ chunkCount: 1,
  chunkTypes: ["file"],
  keywords,
  exports: [],
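
The indexing path is now embedding-free: every JSON file becomes a single whole-file chunk, the extracted paths land in moduleData.jsonPaths, and the same literals are queued in pendingLiterals keyed by chunk id until save() flushes them. A sketch of the resulting shapes for a one-line user.json, derived from generateChunkId and the code above (illustrative, not actual output):

    // user.json: {"name":{"first":"john"}}   (1 line)
    // chunks:          [{ id: "user_json-1-1", content, startLine: 1, endLine: 1, type: "file" }]
    // moduleData:      { jsonPaths: ["user.name", "user.name.first"] }
    // pendingLiterals: "user_json-1-1" -> { filepath: "user.json", literals: [/* ExtractedLiteral[] */] }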
@@ -4512,7 +4488,24 @@ class JsonModule {
  }
  this.symbolicIndex.buildBM25Index();
  await this.symbolicIndex.save();
+ this.literalIndex = new LiteralIndex(indexDir, this.id);
+ await this.literalIndex.initialize();
+ const indexedFilepaths = new Set;
+ for (const filepath of this.pendingSummaries.keys()) {
+ indexedFilepaths.add(filepath);
+ }
+ for (const { filepath } of this.pendingLiterals.values()) {
+ indexedFilepaths.add(filepath);
+ }
+ for (const filepath of indexedFilepaths) {
+ this.literalIndex.removeFile(filepath);
+ }
+ for (const [chunkId, { filepath, literals }] of this.pendingLiterals) {
+ this.literalIndex.addLiterals(chunkId, filepath, literals);
+ }
+ await this.literalIndex.save();
  this.pendingSummaries.clear();
+ this.pendingLiterals.clear();
  }
  async search(query, ctx, options = {}) {
  const {
@@ -4520,8 +4513,15 @@ class JsonModule {
  minScore = DEFAULT_MIN_SCORE3,
  filePatterns
  } = options;
+ const { literals: queryLiterals, remainingQuery } = parseQueryLiterals(query);
  const indexDir = getRaggrepDir(ctx.rootDir, ctx.config);
  const symbolicIndex = new SymbolicIndex(indexDir, this.id);
+ const literalIndex = new LiteralIndex(indexDir, this.id);
+ let literalMatchMap = new Map;
+ try {
+ await literalIndex.initialize();
+ literalMatchMap = literalIndex.buildMatchMap(queryLiterals);
+ } catch {}
  let allFiles;
  try {
  await symbolicIndex.initialize();
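
search() now has a literal side-channel: the query is split into literal tokens and remaining free text before any index is touched, and buildMatchMap resolves those tokens against the saved literal index up front. A rough sketch of the intended flow; the exact tokenization rules of parseQueryLiterals are not part of this diff, so the split shown is an assumption:

    // Assumption: dotted/quoted tokens are treated as literals.
    const query = "user.name.first login";
    const { literals: queryLiterals, remainingQuery } = parseQueryLiterals(query);
    // queryLiterals  ~ [{ value: "user.name.first", ... }]   (assumed shape)
    // remainingQuery ~ "login"

    // literalMatchMap then maps chunkId -> matches for indexed paths that
    // correspond to a query literal; a missing index falls through the catch.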
@@ -4541,25 +4541,16 @@ class JsonModule {
  });
  });
  }
- const queryEmbedding = await getEmbedding(query);
  const bm25Index = new BM25Index;
  const allChunksData = [];
  for (const filepath of filesToSearch) {
  const fileIndex = await ctx.loadFileIndex(filepath);
  if (!fileIndex)
  continue;
- const moduleData = fileIndex.moduleData;
- if (!moduleData?.embeddings)
- continue;
- for (let i = 0;i < fileIndex.chunks.length; i++) {
- const chunk = fileIndex.chunks[i];
- const embedding = moduleData.embeddings[i];
- if (!embedding)
- continue;
+ for (const chunk of fileIndex.chunks) {
  allChunksData.push({
  filepath: fileIndex.filepath,
- chunk,
- embedding
+ chunk
  });
  bm25Index.addDocuments([{ id: chunk.id, content: chunk.content }]);
  }
@@ -4569,32 +4560,70 @@ class JsonModule {
  for (const result of bm25Results) {
  bm25Scores.set(result.id, normalizeScore(result.score, 3));
  }
- const queryTerms = extractQueryTerms(query);
  const results = [];
- for (const { filepath, chunk, embedding } of allChunksData) {
- const semanticScore = cosineSimilarity(queryEmbedding, embedding);
+ const processedChunkIds = new Set;
+ for (const { filepath, chunk } of allChunksData) {
  const bm25Score = bm25Scores.get(chunk.id) || 0;
- const hybridScore = SEMANTIC_WEIGHT2 * semanticScore + BM25_WEIGHT2 * bm25Score;
- if (hybridScore >= minScore || bm25Score > 0.3) {
+ const literalMatches = literalMatchMap.get(chunk.id) || [];
+ const literalContribution = calculateLiteralContribution(literalMatches, bm25Score > 0);
+ const baseScore = BM25_WEIGHT2 * bm25Score;
+ const boostedScore = applyLiteralBoost(baseScore, literalMatches, bm25Score > 0);
+ const literalBase = literalMatches.length > 0 && bm25Score === 0 ? LITERAL_SCORING_CONSTANTS.BASE_SCORE * LITERAL_WEIGHT : 0;
+ const finalScore = boostedScore + literalBase;
+ processedChunkIds.add(chunk.id);
+ if (finalScore >= minScore || literalMatches.length > 0) {
  results.push({
  filepath,
  chunk,
- score: hybridScore,
+ score: finalScore,
  moduleId: this.id,
  context: {
- semanticScore,
- bm25Score
+ bm25Score,
+ literalMultiplier: literalContribution.multiplier,
+ literalMatchType: literalContribution.bestMatchType,
+ literalConfidence: literalContribution.bestConfidence,
+ literalMatchCount: literalContribution.matchCount
  }
  });
  }
  }
+ for (const [chunkId, matches] of literalMatchMap) {
+ if (processedChunkIds.has(chunkId)) {
+ continue;
+ }
+ const filepath = matches[0]?.filepath;
+ if (!filepath)
+ continue;
+ const fileIndex = await ctx.loadFileIndex(filepath);
+ if (!fileIndex)
+ continue;
+ const chunk = fileIndex.chunks.find((c) => c.id === chunkId);
+ if (!chunk)
+ continue;
+ const literalContribution = calculateLiteralContribution(matches, false);
+ const score = LITERAL_SCORING_CONSTANTS.BASE_SCORE * literalContribution.multiplier;
+ processedChunkIds.add(chunkId);
+ results.push({
+ filepath,
+ chunk,
+ score,
+ moduleId: this.id,
+ context: {
+ bm25Score: 0,
+ literalMultiplier: literalContribution.multiplier,
+ literalMatchType: literalContribution.bestMatchType,
+ literalConfidence: literalContribution.bestConfidence,
+ literalMatchCount: literalContribution.matchCount,
+ literalOnly: true
+ }
+ });
+ }
  results.sort((a, b) => b.score - a.score);
  return results.slice(0, topK);
  }
  }
- var DEFAULT_MIN_SCORE3 = 0.15, DEFAULT_TOP_K3 = 10, SEMANTIC_WEIGHT2 = 0.7, BM25_WEIGHT2 = 0.3, JSON_EXTENSIONS, supportsFile2;
+ var DEFAULT_MIN_SCORE3 = 0.1, DEFAULT_TOP_K3 = 10, BM25_WEIGHT2 = 0.4, LITERAL_WEIGHT = 0.6, JSON_EXTENSIONS, supportsFile2;
  var init_json = __esm(() => {
- init_embeddings();
  init_services();
  init_config2();
  init_storage();
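
Scoring for JSON chunks is now BM25-plus-literals instead of the 0.8.0 semantic/BM25 hybrid. A worked sketch of the three branches above, assuming LITERAL_SCORING_CONSTANTS.BASE_SCORE = 0.5 and a 1.5x multiplier from the literal scorer (both values are assumptions; they live in literalScorer, outside this hunk):

    // Constants from this hunk:
    // BM25_WEIGHT2 = 0.4, LITERAL_WEIGHT = 0.6, DEFAULT_MIN_SCORE3 = 0.1

    // 1) BM25 hit (0.8) plus a literal match:
    //    baseScore    = 0.4 * 0.8  = 0.32
    //    boostedScore = 0.32 * 1.5 = 0.48   (assumed applyLiteralBoost behavior)
    //    literalBase  = 0                   (bm25Score > 0)
    //    finalScore   = 0.48

    // 2) Literal match only, chunk already in the BM25 pass (bm25Score = 0):
    //    boostedScore = 0
    //    literalBase  = 0.5 * 0.6 = 0.30
    //    finalScore   = 0.30, kept regardless since literalMatches.length > 0

    // 3) Chunk surfaced only by the literal index (second pass):
    //    score = BASE_SCORE * multiplier = 0.5 * 1.5 = 0.75   (assumed values)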
@@ -4864,7 +4893,7 @@ ${section.content}` : section.content,
  ].includes(t))) {
  docBoost = 0.05;
  }
- const hybridScore = SEMANTIC_WEIGHT3 * semanticScore + BM25_WEIGHT3 * bm25Score + docBoost;
+ const hybridScore = SEMANTIC_WEIGHT2 * semanticScore + BM25_WEIGHT3 * bm25Score + docBoost;
  if (hybridScore >= minScore || bm25Score > 0.3) {
  results.push({
  filepath,
@@ -4883,7 +4912,7 @@ ${section.content}` : section.content,
  return results.slice(0, topK);
  }
  }
- var DEFAULT_MIN_SCORE4 = 0.15, DEFAULT_TOP_K4 = 10, SEMANTIC_WEIGHT3 = 0.7, BM25_WEIGHT3 = 0.3, MARKDOWN_EXTENSIONS, supportsFile3;
+ var DEFAULT_MIN_SCORE4 = 0.15, DEFAULT_TOP_K4 = 10, SEMANTIC_WEIGHT2 = 0.7, BM25_WEIGHT3 = 0.3, MARKDOWN_EXTENSIONS, supportsFile3;
  var init_markdown = __esm(() => {
  init_embeddings();
  init_services();
@@ -6058,4 +6087,4 @@ export {
  ConsoleLogger
  };

- //# debugId=59B4DA12592C31BA64756E2164756E21
+ //# debugId=7A45B6717CB7C82E64756E2164756E21