raggrep 0.1.0 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +138 -6
- package/dist/cli/main.js +539 -36
- package/dist/cli/main.js.map +8 -7
- package/dist/domain/entities/fileSummary.d.ts +18 -0
- package/dist/domain/services/keywords.d.ts +45 -0
- package/dist/index.js +141 -7
- package/dist/index.js.map +7 -6
- package/dist/indexer/index.d.ts +25 -0
- package/dist/indexer/watcher.d.ts +33 -0
- package/package.json +4 -3
|
@@ -9,6 +9,11 @@
|
|
|
9
9
|
* These appear in almost every code file and don't add search value.
|
|
10
10
|
*/
|
|
11
11
|
export declare const COMMON_KEYWORDS: Set<string>;
|
|
12
|
+
/**
|
|
13
|
+
* Common architectural layer patterns in file names/paths.
|
|
14
|
+
* Used to detect the layer a file belongs to.
|
|
15
|
+
*/
|
|
16
|
+
export declare const LAYER_PATTERNS: Record<string, string[]>;
|
|
12
17
|
/**
|
|
13
18
|
* Extract keywords from code content and optional name.
|
|
14
19
|
*
|
|
@@ -21,7 +26,47 @@ export declare function extractKeywords(content: string, name?: string, maxKeywo
|
|
|
21
26
|
/**
|
|
22
27
|
* Extract keywords from a file path.
|
|
23
28
|
*
|
|
29
|
+
* Enhanced extraction that:
|
|
30
|
+
* - Splits camelCase/PascalCase filenames
|
|
31
|
+
* - Extracts directory segments
|
|
32
|
+
* - Recognizes common patterns (Service, Controller, etc.)
|
|
33
|
+
*
|
|
24
34
|
* @param filepath - File path to extract keywords from
|
|
25
35
|
* @returns Array of keywords from path segments
|
|
26
36
|
*/
|
|
27
37
|
export declare function extractPathKeywords(filepath: string): string[];
|
|
38
|
+
/**
 * Structural context extracted from a file path.
 */
export interface PathContext {
    /** Directory segments of the path, excluding the filename itself. */
    segments: string[];
    /** Detected architectural layer (e.g. "service", "controller", "repository"), if any. */
    layer?: string;
    /** Detected feature domain (e.g. "auth", "users", "payments"), if any. */
    domain?: string;
    /** Path depth: the number of directory levels above the file. */
    depth: number;
    /** Keywords extracted from the path segments. */
    keywords: string[];
}
|
|
53
|
+
/**
|
|
54
|
+
* Parse a file path and extract structural context.
|
|
55
|
+
*
|
|
56
|
+
* This helps with:
|
|
57
|
+
* - Boosting files in related directories
|
|
58
|
+
* - Understanding architectural layer
|
|
59
|
+
* - Grouping by feature domain
|
|
60
|
+
*
|
|
61
|
+
* @param filepath - File path to parse
|
|
62
|
+
* @returns Parsed path context
|
|
63
|
+
*/
|
|
64
|
+
export declare function parsePathContext(filepath: string): PathContext;
|
|
65
|
+
/**
|
|
66
|
+
* Generate a path context string for embedding.
|
|
67
|
+
* This is prepended to content to give the embedding model path awareness.
|
|
68
|
+
*
|
|
69
|
+
* @param pathContext - Parsed path context
|
|
70
|
+
* @returns A string representation of the path context
|
|
71
|
+
*/
|
|
72
|
+
export declare function formatPathContextForEmbedding(pathContext: PathContext): string;
|
package/dist/index.js
CHANGED
|
@@ -544,10 +544,92 @@ function extractKeywords(content, name, maxKeywords = 50) {
|
|
|
544
544
|
}
|
|
545
545
|
return Array.from(keywords).slice(0, maxKeywords);
|
|
546
546
|
}
|
|
547
|
+
/**
 * Split a camelCase/snake_case/kebab-case identifier into lowercase words.
 * Words of a single character are dropped.
 *
 * @param str - Identifier to split (e.g. "getUserData", "user_service")
 * @returns Lowercased word parts longer than one character
 */
function splitIdentifier(str) {
  // Insert a space at each lower→upper boundary, then normalize _ and - to spaces.
  const spaced = str
    .replace(/([a-z])([A-Z])/g, "$1 $2")
    .replace(/[_-]/g, " ");
  const words = [];
  for (const piece of spaced.split(/\s+/)) {
    const lowered = piece.toLowerCase();
    if (lowered.length > 1) {
      words.push(lowered);
    }
  }
  return words;
}
|
|
547
550
|
/**
 * Extract search keywords from a file path.
 *
 * Each path segment (extension stripped) contributes both its whole lowercased
 * form and its camelCase/snake_case parts, skipping anything of length <= 2 or
 * listed in COMMON_KEYWORDS.
 *
 * @param filepath - File path to extract keywords from
 * @returns De-duplicated keywords from path segments
 */
function extractPathKeywords(filepath) {
  const stripped = filepath.replace(/\.[^.]+$/, "");
  const found = new Set();
  for (const segment of stripped.split(/[/\\]/)) {
    // Skip trivially short segments (e.g. "", single letters).
    if (segment.length < 2) {
      continue;
    }
    const lowered = segment.toLowerCase();
    if (lowered.length > 2 && !COMMON_KEYWORDS.has(lowered)) {
      found.add(lowered);
    }
    // Also index the individual words inside the segment.
    splitIdentifier(segment)
      .filter((part) => part.length > 2 && !COMMON_KEYWORDS.has(part))
      .forEach((part) => found.add(part));
  }
  return [...found];
}
|
|
550
|
-
|
|
570
|
+
/**
 * Parse a file path into structural context: directory segments, detected
 * architectural layer (via LAYER_PATTERNS), a feature-domain guess, path
 * depth, and path-derived keywords.
 *
 * Fix: removed the unused local `allLower` (joined lowercase segments) that
 * was computed but never read.
 *
 * @param filepath - File path to parse (forward or back slashes)
 * @returns Parsed path context
 */
function parsePathContext(filepath) {
  const pathWithoutExt = filepath.replace(/\.[^.]+$/, "");
  const allSegments = pathWithoutExt.split(/[/\\]/);
  const filename = allSegments[allSegments.length - 1];
  const dirSegments = allSegments.slice(0, -1);
  const keywords = extractPathKeywords(filepath);
  // Layer detection: a pattern occurring in the filename, or matching a
  // directory segment exactly, wins; first layer in declaration order is kept.
  let layer;
  const filenameLower = filename.toLowerCase();
  for (const [layerName, patterns] of Object.entries(LAYER_PATTERNS)) {
    for (const pattern of patterns) {
      if (filenameLower.includes(pattern) || dirSegments.some((s) => s.toLowerCase() === pattern)) {
        layer = layerName;
        break;
      }
    }
    if (layer)
      break;
  }
  // Domain detection: the deepest directory that is neither a generic
  // source-root name nor a layer keyword, and longer than two characters.
  let domain;
  const layerPatternSet = new Set(Object.values(LAYER_PATTERNS).flat());
  const reversedSegments = [...dirSegments].reverse();
  for (const segment of reversedSegments) {
    const lower = segment.toLowerCase();
    if (["src", "lib", "app", "packages", "modules"].includes(lower))
      continue;
    if (layerPatternSet.has(lower))
      continue;
    if (lower.length > 2) {
      domain = lower;
      break;
    }
  }
  return {
    segments: dirSegments,
    layer,
    domain,
    depth: dirSegments.length,
    keywords
  };
}
|
|
615
|
+
/**
 * Generate a bracketed path-context string to prepend to chunk content,
 * giving the embedding model awareness of where the file lives.
 *
 * Order: domain, layer, then up to the last three directory names (skipping
 * "src"/"lib"/"app" and names of length <= 2), de-duplicated.
 *
 * @param pathContext - Parsed path context
 * @returns e.g. "[auth service services]", or "" when nothing is significant
 */
function formatPathContextForEmbedding(pathContext) {
  const { domain, layer, segments } = pathContext;
  const tokens = [];
  if (domain) {
    tokens.push(domain);
  }
  if (layer) {
    tokens.push(layer);
  }
  // Keep only the trailing, meaningful directory names.
  const generic = new Set(["src", "lib", "app"]);
  for (const seg of segments.slice(-3)) {
    const lowered = seg.toLowerCase();
    if (seg.length > 2 && !generic.has(lowered)) {
      tokens.push(lowered);
    }
  }
  if (tokens.length === 0) {
    return "";
  }
  const unique = Array.from(new Set(tokens));
  return `[${unique.join(" ")}]`;
}
|
|
632
|
+
var COMMON_KEYWORDS, LAYER_PATTERNS;
|
|
551
633
|
var init_keywords = __esm(() => {
|
|
552
634
|
COMMON_KEYWORDS = new Set([
|
|
553
635
|
"const",
|
|
@@ -617,6 +699,19 @@ var init_keywords = __esm(() => {
|
|
|
617
699
|
"has",
|
|
618
700
|
"have"
|
|
619
701
|
]);
|
|
702
|
+
  // Maps each architectural layer name to the filename/directory patterns
  // that indicate it. Consumed by parsePathContext: filename substring or
  // exact directory-segment matches assign the layer, and the union of all
  // pattern words is excluded from domain detection.
  LAYER_PATTERNS = {
    controller: ["controller", "controllers", "handler", "handlers", "route", "routes", "api"],
    service: ["service", "services", "usecase", "usecases", "application"],
    repository: ["repository", "repositories", "repo", "repos", "dao", "store", "storage"],
    model: ["model", "models", "entity", "entities", "schema", "schemas"],
    util: ["util", "utils", "utility", "utilities", "helper", "helpers", "common", "shared"],
    config: ["config", "configs", "configuration", "settings"],
    middleware: ["middleware", "middlewares", "interceptor", "interceptors"],
    domain: ["domain", "core", "business"],
    infrastructure: ["infrastructure", "infra", "external", "adapters"],
    presentation: ["presentation", "view", "views", "component", "components", "ui"],
    test: ["test", "tests", "spec", "specs", "__tests__", "__test__"]
  };
|
|
620
715
|
});
|
|
621
716
|
|
|
622
717
|
// src/utils/tieredIndex.ts
|
|
@@ -795,7 +890,12 @@ class SemanticModule {
|
|
|
795
890
|
if (parsedChunks.length === 0) {
|
|
796
891
|
return null;
|
|
797
892
|
}
|
|
798
|
-
const
|
|
893
|
+
const pathContext = parsePathContext(filepath);
|
|
894
|
+
const pathPrefix = formatPathContextForEmbedding(pathContext);
|
|
895
|
+
const chunkContents = parsedChunks.map((c) => {
|
|
896
|
+
const namePrefix = c.name ? `${c.name}: ` : "";
|
|
897
|
+
return `${pathPrefix} ${namePrefix}${c.content}`;
|
|
898
|
+
});
|
|
799
899
|
const embeddings = await getEmbeddings(chunkContents);
|
|
800
900
|
const chunks = parsedChunks.map((pc) => ({
|
|
801
901
|
id: generateChunkId(filepath, pc.startLine, pc.endLine),
|
|
@@ -821,13 +921,20 @@ class SemanticModule {
|
|
|
821
921
|
const keywords = extractKeywords(pc.content, pc.name);
|
|
822
922
|
keywords.forEach((k) => allKeywords.add(k));
|
|
823
923
|
}
|
|
924
|
+
pathContext.keywords.forEach((k) => allKeywords.add(k));
|
|
824
925
|
const fileSummary = {
|
|
825
926
|
filepath,
|
|
826
927
|
chunkCount: chunks.length,
|
|
827
928
|
chunkTypes,
|
|
828
929
|
keywords: Array.from(allKeywords),
|
|
829
930
|
exports,
|
|
830
|
-
lastModified: stats.lastModified
|
|
931
|
+
lastModified: stats.lastModified,
|
|
932
|
+
pathContext: {
|
|
933
|
+
segments: pathContext.segments,
|
|
934
|
+
layer: pathContext.layer,
|
|
935
|
+
domain: pathContext.domain,
|
|
936
|
+
depth: pathContext.depth
|
|
937
|
+
}
|
|
831
938
|
};
|
|
832
939
|
this.pendingSummaries.set(filepath, fileSummary);
|
|
833
940
|
return {
|
|
@@ -904,11 +1011,32 @@ class SemanticModule {
|
|
|
904
1011
|
for (const result of bm25Results) {
|
|
905
1012
|
bm25Scores.set(result.id, normalizeScore(result.score, 3));
|
|
906
1013
|
}
|
|
1014
|
+
const queryTerms = query.toLowerCase().split(/\s+/).filter((t) => t.length > 2);
|
|
1015
|
+
const pathBoosts = new Map;
|
|
1016
|
+
for (const filepath of candidateFiles) {
|
|
1017
|
+
const summary = symbolicIndex.getFileSummary(filepath);
|
|
1018
|
+
if (summary?.pathContext) {
|
|
1019
|
+
let boost = 0;
|
|
1020
|
+
const ctx2 = summary.pathContext;
|
|
1021
|
+
if (ctx2.domain && queryTerms.some((t) => ctx2.domain.includes(t) || t.includes(ctx2.domain))) {
|
|
1022
|
+
boost += 0.1;
|
|
1023
|
+
}
|
|
1024
|
+
if (ctx2.layer && queryTerms.some((t) => ctx2.layer.includes(t) || t.includes(ctx2.layer))) {
|
|
1025
|
+
boost += 0.05;
|
|
1026
|
+
}
|
|
1027
|
+
const segmentMatch = ctx2.segments.some((seg) => queryTerms.some((t) => seg.toLowerCase().includes(t) || t.includes(seg.toLowerCase())));
|
|
1028
|
+
if (segmentMatch) {
|
|
1029
|
+
boost += 0.05;
|
|
1030
|
+
}
|
|
1031
|
+
pathBoosts.set(filepath, boost);
|
|
1032
|
+
}
|
|
1033
|
+
}
|
|
907
1034
|
const results = [];
|
|
908
1035
|
for (const { filepath, chunk, embedding } of allChunksData) {
|
|
909
1036
|
const semanticScore = cosineSimilarity(queryEmbedding, embedding);
|
|
910
1037
|
const bm25Score = bm25Scores.get(chunk.id) || 0;
|
|
911
|
-
const
|
|
1038
|
+
const pathBoost = pathBoosts.get(filepath) || 0;
|
|
1039
|
+
const hybridScore = SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score + pathBoost;
|
|
912
1040
|
if (hybridScore >= minScore || bm25Score > 0.3) {
|
|
913
1041
|
results.push({
|
|
914
1042
|
filepath,
|
|
@@ -917,7 +1045,8 @@ class SemanticModule {
|
|
|
917
1045
|
moduleId: this.id,
|
|
918
1046
|
context: {
|
|
919
1047
|
semanticScore,
|
|
920
|
-
bm25Score
|
|
1048
|
+
bm25Score,
|
|
1049
|
+
pathBoost
|
|
921
1050
|
}
|
|
922
1051
|
});
|
|
923
1052
|
}
|
|
@@ -956,6 +1085,7 @@ var init_semantic = __esm(() => {
|
|
|
956
1085
|
init_config2();
|
|
957
1086
|
init_parseCode();
|
|
958
1087
|
init_tieredIndex();
|
|
1088
|
+
init_keywords();
|
|
959
1089
|
});
|
|
960
1090
|
|
|
961
1091
|
// src/indexer/index.ts
|
|
@@ -990,6 +1120,10 @@ async function registerBuiltInModules() {
|
|
|
990
1120
|
registry.register(new SemanticModule2);
|
|
991
1121
|
}
|
|
992
1122
|
|
|
1123
|
+
// src/indexer/watcher.ts
|
|
1124
|
+
import { watch } from "chokidar";
|
|
1125
|
+
init_config2();
|
|
1126
|
+
|
|
993
1127
|
// src/indexer/index.ts
|
|
994
1128
|
async function indexDirectory(rootDir, options = {}) {
|
|
995
1129
|
const verbose = options.verbose ?? false;
|
|
@@ -1375,4 +1509,4 @@ export {
|
|
|
1375
1509
|
cleanup
|
|
1376
1510
|
};
|
|
1377
1511
|
|
|
1378
|
-
//# debugId=
|
|
1512
|
+
//# debugId=791A08B2C54816DA64756E2164756E21
|