kiri-mcp-server 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +58 -5
- package/config/default.example.yml +9 -0
- package/config/scoring-profiles.yml +11 -6
- package/dist/config/default.example.yml +9 -0
- package/dist/config/scoring-profiles.yml +11 -6
- package/dist/package.json +1 -1
- package/dist/server/context.js +0 -1
- package/dist/server/handlers.js +547 -79
- package/dist/server/scoring.js +8 -3
- package/dist/shared/duckdb.js +0 -2
- package/dist/shared/embedding.js +15 -2
- package/dist/shared/tokenizer.js +0 -1
- package/dist/shared/utils/simpleYaml.js +0 -1
- package/dist/src/server/handlers.d.ts.map +1 -1
- package/dist/src/server/handlers.js +234 -26
- package/dist/src/server/handlers.js.map +1 -1
- package/dist/src/server/rpc.d.ts.map +1 -1
- package/dist/src/server/rpc.js +9 -3
- package/dist/src/server/rpc.js.map +1 -1
- package/dist/src/server/scoring.d.ts +2 -0
- package/dist/src/server/scoring.d.ts.map +1 -1
- package/dist/src/server/scoring.js +13 -1
- package/dist/src/server/scoring.js.map +1 -1
- package/dist/src/shared/duckdb.d.ts +1 -0
- package/dist/src/shared/duckdb.d.ts.map +1 -1
- package/dist/src/shared/duckdb.js +54 -3
- package/dist/src/shared/duckdb.js.map +1 -1
- package/dist/src/shared/embedding.d.ts.map +1 -1
- package/dist/src/shared/embedding.js +2 -8
- package/dist/src/shared/embedding.js.map +1 -1
- package/dist/src/shared/tokenizer.d.ts +18 -0
- package/dist/src/shared/tokenizer.d.ts.map +1 -1
- package/dist/src/shared/tokenizer.js +35 -0
- package/dist/src/shared/tokenizer.js.map +1 -1
- package/package.json +1 -1
package/dist/server/scoring.js
CHANGED
|
@@ -12,7 +12,7 @@ function validateWeights(weights, profileName) {
|
|
|
12
12
|
if (typeof weights !== "object" || weights === null) {
|
|
13
13
|
throw new Error(`Profile '${profileName}' must be an object`);
|
|
14
14
|
}
|
|
15
|
-
const required = ["textMatch", "editingPath", "dependency", "proximity", "structural"];
|
|
15
|
+
const required = ["textMatch", "pathMatch", "editingPath", "dependency", "proximity", "structural"];
|
|
16
16
|
const obj = weights;
|
|
17
17
|
for (const key of required) {
|
|
18
18
|
const value = obj[key];
|
|
@@ -28,8 +28,9 @@ function loadProfilesFromConfig() {
|
|
|
28
28
|
}
|
|
29
29
|
try {
|
|
30
30
|
// 環境変数でカスタムパスを指定可能
|
|
31
|
+
// 本番環境(npm install)では dist/config/ を、開発環境では config/ を参照
|
|
31
32
|
const configPath = process.env.KIRI_SCORING_CONFIG ??
|
|
32
|
-
join(fileURLToPath(import.meta.url), "
|
|
33
|
+
join(fileURLToPath(import.meta.url), "../../config/scoring-profiles.yml");
|
|
33
34
|
const configContent = readFileSync(configPath, "utf-8");
|
|
34
35
|
const parsed = parseSimpleYaml(configContent);
|
|
35
36
|
// 必須プロファイルの検証とウェイトのバリデーション
|
|
@@ -56,6 +57,7 @@ function loadProfilesFromConfig() {
|
|
|
56
57
|
profilesCache = {
|
|
57
58
|
default: {
|
|
58
59
|
textMatch: 1.0,
|
|
60
|
+
pathMatch: 1.5,
|
|
59
61
|
editingPath: 2.0,
|
|
60
62
|
dependency: 0.5,
|
|
61
63
|
proximity: 0.25,
|
|
@@ -63,6 +65,7 @@ function loadProfilesFromConfig() {
|
|
|
63
65
|
},
|
|
64
66
|
bugfix: {
|
|
65
67
|
textMatch: 1.0,
|
|
68
|
+
pathMatch: 1.5,
|
|
66
69
|
editingPath: 1.8,
|
|
67
70
|
dependency: 0.7,
|
|
68
71
|
proximity: 0.35,
|
|
@@ -70,6 +73,7 @@ function loadProfilesFromConfig() {
|
|
|
70
73
|
},
|
|
71
74
|
testfail: {
|
|
72
75
|
textMatch: 1.0,
|
|
76
|
+
pathMatch: 1.5,
|
|
73
77
|
editingPath: 1.6,
|
|
74
78
|
dependency: 0.85,
|
|
75
79
|
proximity: 0.3,
|
|
@@ -77,6 +81,7 @@ function loadProfilesFromConfig() {
|
|
|
77
81
|
},
|
|
78
82
|
typeerror: {
|
|
79
83
|
textMatch: 1.0,
|
|
84
|
+
pathMatch: 1.5,
|
|
80
85
|
editingPath: 1.4,
|
|
81
86
|
dependency: 0.6,
|
|
82
87
|
proximity: 0.4,
|
|
@@ -84,6 +89,7 @@ function loadProfilesFromConfig() {
|
|
|
84
89
|
},
|
|
85
90
|
feature: {
|
|
86
91
|
textMatch: 1.0,
|
|
92
|
+
pathMatch: 1.5,
|
|
87
93
|
editingPath: 1.5,
|
|
88
94
|
dependency: 0.45,
|
|
89
95
|
proximity: 0.5,
|
|
@@ -108,4 +114,3 @@ export function loadScoringProfile(profileName) {
|
|
|
108
114
|
}
|
|
109
115
|
return profiles.default;
|
|
110
116
|
}
|
|
111
|
-
//# sourceMappingURL=scoring.js.map
|
package/dist/shared/duckdb.js
CHANGED
|
@@ -20,7 +20,6 @@ function assertNoUndefined(sql, params) {
|
|
|
20
20
|
}
|
|
21
21
|
}
|
|
22
22
|
export class DuckDBClient {
|
|
23
|
-
database;
|
|
24
23
|
constructor(path) {
|
|
25
24
|
this.database = new duckdb.Database(path);
|
|
26
25
|
}
|
|
@@ -118,4 +117,3 @@ export class DuckDBClient {
|
|
|
118
117
|
});
|
|
119
118
|
}
|
|
120
119
|
}
|
|
121
|
-
//# sourceMappingURL=duckdb.js.map
|
package/dist/shared/embedding.js
CHANGED
|
@@ -1,9 +1,23 @@
|
|
|
1
1
|
import { createHash } from "node:crypto";
|
|
2
2
|
const DEFAULT_DIMS = 64;
|
|
3
|
+
/**
 * Tokenize text for embedding generation.
 *
 * The input is lower-cased and split on every run of characters that is not
 * a Unicode letter, a Unicode number or an underscore. Unless the
 * KIRI_TOKENIZATION_STRATEGY environment variable is "legacy"
 * (case-insensitive), hyphens are kept as token characters too, so
 * hyphen-separated terms such as "page-agent" survive as a single token —
 * the same approach keyword extraction takes.
 *
 * @param {string} text - Raw text to tokenize.
 * @returns {string[]} Non-empty, lower-cased tokens in order of appearance.
 */
function tokenize(text) {
  const isLegacy = process.env.KIRI_TOKENIZATION_STRATEGY?.toLowerCase() === "legacy";
  // Legacy mode also splits on hyphens (the historical behavior);
  // phrase-aware and hybrid modes keep hyphens inside tokens.
  const separator = isLegacy ? /[^\p{L}\p{N}_]+/u : /[^\p{L}\p{N}_-]+/u;
  const tokens = [];
  for (const piece of text.toLowerCase().split(separator)) {
    const token = piece.trim();
    if (token.length > 0) {
      tokens.push(token);
    }
  }
  return tokens;
}
|
|
@@ -82,4 +96,3 @@ export function structuralSimilarity(a, b) {
|
|
|
82
96
|
/** @deprecated Use structuralSimilarity() instead. Kept for backward compatibility. */
|
|
83
97
|
export const cosineSimilarity = structuralSimilarity;
|
|
84
98
|
export const EMBEDDING_DIMS = DEFAULT_DIMS;
|
|
85
|
-
//# sourceMappingURL=embedding.js.map
|
package/dist/shared/tokenizer.js
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"handlers.d.ts","sourceRoot":"","sources":["../../../src/server/handlers.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AAInD,OAAO,EAAE,aAAa,EAAE,MAAM,cAAc,CAAC;AAG7C,MAAM,WAAW,iBAAiB;IAChC,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,aAAa,CAAC,EAAE,SAAS,GAAG,MAAM,GAAG,MAAM,CAAC;CAC7C;AAED,MAAM,WAAW,iBAAiB;IAChC,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,MAAM,CAAC;IAClB,IAAI,EAAE,MAAM,GAAG,IAAI,CAAC;IACpB,GAAG,EAAE,MAAM,GAAG,IAAI,CAAC;IACnB,KAAK,EAAE,MAAM,CAAC;CACf;AAED,MAAM,WAAW,iBAAiB;IAChC,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,aAAa;IAC5B,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1B,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;CAC3B;AAED,MAAM,WAAW,sBAAsB;IACrC,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;IACzB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,mBAAmB;IAClC,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,CAAC,EAAE,sBAAsB,CAAC;IACnC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,aAAa,CAAC,EAAE,SAAS,GAAG,MAAM,GAAG,MAAM,CAAC;IAC5C,OAAO,CAAC,EAAE,OAAO,CAAC;CACnB;AAED,MAAM,WAAW,iBAAiB;IAChC,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACxB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,GAAG,EAAE,MAAM,EAAE,CAAC;IACd,KAAK,EAAE,MAAM,CAAC;CACf;AAED,MAAM,WAAW,mBAAmB;IAClC,OAAO,EAAE,iBAAiB,EAAE,CAAC;IAC7B,eAAe,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,4BAA4B;IAC3C,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,oBAAoB;IACnC,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,EAAE,4BAA4B,EAAE,CAAC;IAC3C,CAAC,CAAC,EAAE,MAAM,CAAC;IACX,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,kBAAkB;IACjC,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,oBAAoB;IACnC,UAAU,EAAE,
kBAAkB,EAAE,CAAC;CAClC;AAgDD,MAAM,WAAW,iBAAiB;IAChC,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,UAAU,GAAG,SAAS,CAAC;IACnC,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B;AAED,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,MAAM,GAAG,SAAS,CAAC;IACzB,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;CACf;AAED,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,GAAG,SAAS,CAAC;IACzB,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,CAAC;CACf;AAED,MAAM,WAAW,iBAAiB;IAChC,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,UAAU,GAAG,SAAS,CAAC;IAClC,KAAK,EAAE,eAAe,EAAE,CAAC;IACzB,KAAK,EAAE,eAAe,EAAE,CAAC;CAC1B;
|
|
1
|
+
{"version":3,"file":"handlers.d.ts","sourceRoot":"","sources":["../../../src/server/handlers.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AAInD,OAAO,EAAE,aAAa,EAAE,MAAM,cAAc,CAAC;AAG7C,MAAM,WAAW,iBAAiB;IAChC,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,aAAa,CAAC,EAAE,SAAS,GAAG,MAAM,GAAG,MAAM,CAAC;CAC7C;AAED,MAAM,WAAW,iBAAiB;IAChC,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,MAAM,CAAC;IAClB,IAAI,EAAE,MAAM,GAAG,IAAI,CAAC;IACpB,GAAG,EAAE,MAAM,GAAG,IAAI,CAAC;IACnB,KAAK,EAAE,MAAM,CAAC;CACf;AAED,MAAM,WAAW,iBAAiB;IAChC,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,aAAa;IAC5B,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1B,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;CAC3B;AAED,MAAM,WAAW,sBAAsB;IACrC,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;IACzB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,mBAAmB;IAClC,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,CAAC,EAAE,sBAAsB,CAAC;IACnC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,aAAa,CAAC,EAAE,SAAS,GAAG,MAAM,GAAG,MAAM,CAAC;IAC5C,OAAO,CAAC,EAAE,OAAO,CAAC;CACnB;AAED,MAAM,WAAW,iBAAiB;IAChC,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACxB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,GAAG,EAAE,MAAM,EAAE,CAAC;IACd,KAAK,EAAE,MAAM,CAAC;CACf;AAED,MAAM,WAAW,mBAAmB;IAClC,OAAO,EAAE,iBAAiB,EAAE,CAAC;IAC7B,eAAe,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,4BAA4B;IAC3C,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,oBAAoB;IACnC,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,EAAE,4BAA4B,EAAE,CAAC;IAC3C,CAAC,CAAC,EAAE,MAAM,CAAC;IACX,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,kBAAkB;IACjC,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,oBAAoB;IACnC,UAAU,EAAE,
kBAAkB,EAAE,CAAC;CAClC;AAgDD,MAAM,WAAW,iBAAiB;IAChC,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,UAAU,GAAG,SAAS,CAAC;IACnC,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B;AAED,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,MAAM,GAAG,SAAS,CAAC;IACzB,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;CACf;AAED,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,GAAG,SAAS,CAAC;IACzB,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,CAAC;CACf;AAED,MAAM,WAAW,iBAAiB;IAChC,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,UAAU,GAAG,SAAS,CAAC;IAClC,KAAK,EAAE,eAAe,EAAE,CAAC;IACzB,KAAK,EAAE,eAAe,EAAE,CAAC;CAC1B;AA6rBD,wBAAsB,WAAW,CAC/B,OAAO,EAAE,aAAa,EACtB,MAAM,EAAE,iBAAiB,GACxB,OAAO,CAAC,iBAAiB,EAAE,CAAC,CA+G9B;AAED,wBAAsB,WAAW,CAC/B,OAAO,EAAE,aAAa,EACtB,MAAM,EAAE,iBAAiB,GACxB,OAAO,CAAC,aAAa,CAAC,CA0GxB;AAED,wBAAsB,aAAa,CACjC,OAAO,EAAE,aAAa,EACtB,MAAM,EAAE,mBAAmB,GAC1B,OAAO,CAAC,mBAAmB,CAAC,CAkZ9B;AAED,wBAAsB,cAAc,CAClC,OAAO,EAAE,aAAa,EACtB,MAAM,EAAE,oBAAoB,GAC3B,OAAO,CAAC,oBAAoB,CAAC,CA+E/B;AAED,wBAAsB,WAAW,CAC/B,OAAO,EAAE,aAAa,EACtB,MAAM,EAAE,iBAAiB,GACxB,OAAO,CAAC,iBAAiB,CAAC,CAuJ5B;AAED,wBAAsB,aAAa,CAAC,EAAE,EAAE,YAAY,EAAE,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAqBvF"}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import path from "node:path";
|
|
2
2
|
import { generateEmbedding, structuralSimilarity } from "../shared/embedding.js";
|
|
3
|
-
import { encode as encodeGPT } from "../shared/tokenizer.js";
|
|
3
|
+
import { encode as encodeGPT, tokenizeText } from "../shared/tokenizer.js";
|
|
4
4
|
import { coerceProfileName, loadScoringProfile } from "./scoring.js";
|
|
5
5
|
const DEFAULT_SEARCH_LIMIT = 50;
|
|
6
6
|
const DEFAULT_SNIPPET_WINDOW = 150;
|
|
@@ -73,22 +73,126 @@ function normalizeBundleLimit(limit) {
|
|
|
73
73
|
}
|
|
74
74
|
return Math.min(Math.max(1, Math.floor(limit)), MAX_BUNDLE_LIMIT);
|
|
75
75
|
}
|
|
76
|
+
/**
 * Resolve the tokenization strategy.
 *
 * Reads the KIRI_TOKENIZATION_STRATEGY environment variable
 * (case-insensitive). "legacy" and "hybrid" are returned as-is; an unset
 * variable or any other value falls back to the default, "phrase-aware".
 *
 * @returns {"legacy" | "hybrid" | "phrase-aware"} The strategy to apply.
 */
function getTokenizationStrategy() {
  const requested = process.env.KIRI_TOKENIZATION_STRATEGY?.toLowerCase();
  switch (requested) {
    case "legacy":
    case "hybrid":
      return requested;
    default:
      return "phrase-aware"; // default strategy
  }
}
|
|
87
|
+
/**
 * Extract phrases enclosed in double or single quotes.
 *
 * Example: 'search "page-agent handler" test' → phrases ["page-agent handler"]
 *
 * A single quote only counts as a delimiter when it does not touch a letter
 * or digit on its outer side, so apostrophes inside words ("don't", "users'")
 * are not mistaken for the start or end of a quoted phrase.
 *
 * @param {string} text - Raw query text.
 * @returns {{ phrases: string[], remaining: string }} `phrases` holds the
 *   trimmed, lower-cased quoted phrases that are at least 3 characters long;
 *   `remaining` is `text` with every quoted span (including ones too short
 *   to keep) replaced by a single space.
 */
function extractQuotedPhrases(text) {
  const quotePattern = /"([^"]+)"|(?<![\p{L}\p{N}])'([^']+)'(?![\p{L}\p{N}])/gu;
  const phrases = [];
  for (const match of text.matchAll(quotePattern)) {
    // Exactly one of the two capture groups is set, depending on the quote style.
    const phrase = (match[1] ?? match[2] ?? "").trim().toLowerCase();
    if (phrase.length >= 3) {
      phrases.push(phrase);
    }
  }
  // Blank out every quoted span so later extraction phases do not see it again.
  const remaining = text.replace(quotePattern, " ");
  return { phrases, remaining };
}
|
|
106
|
+
/**
 * Extract compound terms joined by hyphens or underscores.
 *
 * Unicode letters and numbers are supported, so compound terms containing
 * Japanese, Chinese, etc. are recognized as well.
 *
 * Examples:
 *   "page-agent lambda-handler"   → ["page-agent", "lambda-handler"]
 *   "user_profile file_embedding" → ["user_profile", "file_embedding"]
 *   "app-日本語"                   → ["app-日本語"]
 *
 * @param {string} text - Text to scan.
 * @returns {string[]} Unique lower-cased compound terms in order of first
 *   appearance, excluding entries found in STOP_WORDS.
 */
function extractCompoundTerms(text) {
  // kebab-case and snake_case are treated alike. `\b` cannot be used as the
  // boundary because it counts "_" as a word character, so the boundaries
  // are spelled out explicitly: a non-term character (or start of input)
  // before the term and a non-term character (or end of input) after it.
  const compoundPattern = /(?:^|\s|[^\p{L}\p{N}_-])([\p{L}\p{N}]+(?:[-_][\p{L}\p{N}]+)+)(?=\s|[^\p{L}\p{N}_-]|$)/giu;
  // A Set keeps first-seen order while dropping repeats, so a term typed
  // twice is not counted twice by callers that score each returned term.
  const terms = new Set();
  for (const [, rawTerm] of text.matchAll(compoundPattern)) {
    const term = rawTerm.toLowerCase();
    if (term.length >= 3 && !STOP_WORDS.has(term)) {
      terms.add(term);
    }
  }
  return [...terms];
}
|
|
124
|
+
/**
 * Extract the segments of path-like terms.
 *
 * Example: "lambda/page-agent/handler" → ["lambda", "page-agent", "handler"]
 *
 * A path-like term is two or more runs of letters, numbers, "_" or "-"
 * joined by "/". Unicode letters and numbers are supported.
 *
 * @param {string} text - Text to scan.
 * @returns {string[]} Unique lower-cased segments of at least 3 characters,
 *   in order of first appearance, excluding entries found in STOP_WORDS.
 */
function extractPathSegments(text) {
  // `\b` only recognizes ASCII word characters, so it never matches next to
  // non-ASCII letters and can cut a segment short (e.g. "café" → "caf").
  // The lookarounds below express the same "word boundary" idea with
  // Unicode letters/numbers: the match must start and end on a letter,
  // number or "_" that is not adjacent to another such character.
  const pathPattern = /(?<![\p{L}\p{N}_])(?=[\p{L}\p{N}_])[\p{L}\p{N}_-]+(?:\/[\p{L}\p{N}_-]+)+(?<=[\p{L}\p{N}_])(?![\p{L}\p{N}_])/gu;
  const segments = new Set();
  for (const [candidatePath] of text.matchAll(pathPattern)) {
    for (const part of candidatePath.toLowerCase().split("/")) {
      if (part.length >= 3 && !STOP_WORDS.has(part)) {
        segments.add(part);
      }
    }
  }
  return [...segments];
}
|
|
144
|
+
/**
 * Extract ordinary words from text.
 *
 * Tokenization is delegated to the shared `tokenizeText` utility; tokens
 * shorter than 3 characters or listed in STOP_WORDS are discarded.
 *
 * @param {string} text - Text to tokenize.
 * @param {string} strategy - Tokenization strategy forwarded to `tokenizeText`.
 * @returns {string[]} The surviving tokens, in the order `tokenizeText` produced them.
 */
function extractRegularWords(text, strategy) {
  const kept = [];
  for (const word of tokenizeText(text, strategy)) {
    if (word.length < 3 || STOP_WORDS.has(word)) {
      continue;
    }
    kept.push(word);
  }
  return kept;
}
|
|
152
|
+
/**
 * Extract phrases, keywords and path segments from text.
 *
 * How hyphen/underscore-separated terms are handled depends on the
 * tokenization strategy returned by getTokenizationStrategy():
 *   - "phrase-aware": compound terms are kept whole as phrases.
 *   - "hybrid": compound terms are kept as phrases and their parts are
 *     also added as keywords.
 *   - "legacy": compound terms are not extracted as phrases.
 *
 * @param {string} text - Raw query text.
 * @returns {{ phrases: string[], keywords: string[], pathSegments: string[] }}
 *   `phrases` and `keywords` contain no duplicates, no keyword repeats a
 *   phrase, and `keywords` never holds more than MAX_KEYWORDS entries.
 */
function extractKeywords(text) {
  const strategy = getTokenizationStrategy();

  // Phase 1: quoted phrases. Everything after this works on the text with
  // the quoted spans removed.
  const { phrases: quotedPhrases, remaining: afterQuotes } = extractQuotedPhrases(text);
  // Sets preserve insertion order while dropping repeats, so a term that
  // appears twice is not weighted twice downstream.
  const phrases = new Set(quotedPhrases);
  const keywords = new Set();

  // Phase 2: path segments.
  const pathSegments = extractPathSegments(afterQuotes);

  // Phase 3: compound terms (hyphen/underscore separated) — phrase-aware
  // and hybrid modes only.
  if (strategy === "phrase-aware" || strategy === "hybrid") {
    const compoundTerms = extractCompoundTerms(afterQuotes);
    for (const term of compoundTerms) {
      phrases.add(term);
    }
    // Hybrid mode additionally contributes the individual parts as keywords.
    if (strategy === "hybrid") {
      for (const term of compoundTerms) {
        for (const part of term.split(/[-_]/)) {
          if (keywords.size >= MAX_KEYWORDS) {
            break;
          }
          if (part.length >= 3 && !STOP_WORDS.has(part)) {
            keywords.add(part);
          }
        }
      }
    }
  }

  // Phase 4: ordinary words, skipping anything already captured as a phrase
  // and stopping once the keyword cap is reached.
  for (const word of extractRegularWords(afterQuotes, strategy)) {
    if (keywords.size >= MAX_KEYWORDS) {
      break;
    }
    if (!phrases.has(word)) {
      keywords.add(word);
    }
  }

  return {
    phrases: [...phrases],
    keywords: [...keywords],
    pathSegments: [...pathSegments],
  };
}
|
|
93
197
|
function ensureCandidate(map, filePath) {
|
|
94
198
|
let candidate = map.get(filePath);
|
|
@@ -321,13 +425,41 @@ function applyFileTypeBoost(path, baseScore, profile = "default") {
|
|
|
321
425
|
* @param row - ファイル情報(path, ext)
|
|
322
426
|
* @param profile - ブーストプロファイル
|
|
323
427
|
*/
|
|
324
|
-
function applyBoostProfile(candidate, row, profile) {
|
|
428
|
+
function applyBoostProfile(candidate, row, profile, extractedTerms, pathMatchWeight) {
|
|
325
429
|
if (profile === "none") {
|
|
326
430
|
return;
|
|
327
431
|
}
|
|
328
432
|
const { path, ext } = row;
|
|
329
433
|
const lowerPath = path.toLowerCase();
|
|
330
434
|
const fileName = path.split("/").pop() ?? "";
|
|
435
|
+
// パスベースのスコアリング: goalのキーワード/フレーズがファイルパスに含まれる場合にブースト
|
|
436
|
+
if (extractedTerms && pathMatchWeight && pathMatchWeight > 0) {
|
|
437
|
+
// フレーズがパスに完全一致する場合(最高の重み)
|
|
438
|
+
for (const phrase of extractedTerms.phrases) {
|
|
439
|
+
if (lowerPath.includes(phrase)) {
|
|
440
|
+
candidate.score += pathMatchWeight * 1.5; // 1.5倍のブースト
|
|
441
|
+
candidate.reasons.add(`path-phrase:${phrase}`);
|
|
442
|
+
break; // 最初のマッチのみ適用
|
|
443
|
+
}
|
|
444
|
+
}
|
|
445
|
+
// パスセグメントがマッチする場合(中程度の重み)
|
|
446
|
+
const pathParts = lowerPath.split("/");
|
|
447
|
+
for (const segment of extractedTerms.pathSegments) {
|
|
448
|
+
if (pathParts.includes(segment)) {
|
|
449
|
+
candidate.score += pathMatchWeight;
|
|
450
|
+
candidate.reasons.add(`path-segment:${segment}`);
|
|
451
|
+
break; // 最初のマッチのみ適用
|
|
452
|
+
}
|
|
453
|
+
}
|
|
454
|
+
// 通常のキーワードがパスに含まれる場合(低い重み)
|
|
455
|
+
for (const keyword of extractedTerms.keywords) {
|
|
456
|
+
if (lowerPath.includes(keyword)) {
|
|
457
|
+
candidate.score += pathMatchWeight * 0.5; // 0.5倍のブースト
|
|
458
|
+
candidate.reasons.add(`path-keyword:${keyword}`);
|
|
459
|
+
break; // 最初のマッチのみ適用
|
|
460
|
+
}
|
|
461
|
+
}
|
|
462
|
+
}
|
|
331
463
|
// Blacklisted directories that are almost always irrelevant for code context
|
|
332
464
|
const blacklistedDirs = [
|
|
333
465
|
".cursor/",
|
|
@@ -425,7 +557,7 @@ function applyBoostProfile(candidate, row, profile) {
|
|
|
425
557
|
// Penalize documentation and other non-code files
|
|
426
558
|
const docExtensions = [".md", ".yaml", ".yml", ".mdc", ".json"];
|
|
427
559
|
if (docExtensions.some((docExt) => path.endsWith(docExt))) {
|
|
428
|
-
candidate.score -=
|
|
560
|
+
candidate.score -= 2.0; // Strong penalty to overcome doc-heavy keyword matches
|
|
429
561
|
candidate.reasons.add("penalty:doc-file");
|
|
430
562
|
}
|
|
431
563
|
// Boost implementation files, with more specific paths getting higher scores
|
|
@@ -649,18 +781,25 @@ export async function contextBundle(context, params) {
|
|
|
649
781
|
}
|
|
650
782
|
const semanticSeed = keywordSources.join(" ");
|
|
651
783
|
const queryEmbedding = generateEmbedding(semanticSeed)?.values ?? null;
|
|
652
|
-
|
|
653
|
-
|
|
784
|
+
const extractedTerms = extractKeywords(semanticSeed);
|
|
785
|
+
// フォールバック: editing_pathからキーワードを抽出
|
|
786
|
+
if (extractedTerms.phrases.length === 0 &&
|
|
787
|
+
extractedTerms.keywords.length === 0 &&
|
|
788
|
+
artifacts.editing_path) {
|
|
654
789
|
const pathSegments = artifacts.editing_path
|
|
655
790
|
.split(/[/_.-]/)
|
|
656
791
|
.map((segment) => segment.toLowerCase())
|
|
657
792
|
.filter((segment) => segment.length >= 3 && !STOP_WORDS.has(segment));
|
|
658
|
-
|
|
793
|
+
extractedTerms.pathSegments.push(...pathSegments.slice(0, MAX_KEYWORDS));
|
|
659
794
|
}
|
|
660
795
|
const candidates = new Map();
|
|
661
796
|
const stringMatchSeeds = new Set();
|
|
662
797
|
const fileCache = new Map();
|
|
663
|
-
|
|
798
|
+
// フレーズマッチング(高い重み: textMatch × 2)- 統合クエリでパフォーマンス改善
|
|
799
|
+
if (extractedTerms.phrases.length > 0) {
|
|
800
|
+
const phrasePlaceholders = extractedTerms.phrases
|
|
801
|
+
.map(() => "b.content ILIKE '%' || ? || '%'")
|
|
802
|
+
.join(" OR ");
|
|
664
803
|
const rows = await db.all(`
|
|
665
804
|
SELECT f.path, f.lang, f.ext, f.is_binary, b.content, fe.vector_json, fe.dims AS vector_dims
|
|
666
805
|
FROM file f
|
|
@@ -670,21 +809,90 @@ export async function contextBundle(context, params) {
|
|
|
670
809
|
AND fe.path = f.path
|
|
671
810
|
WHERE f.repo_id = ?
|
|
672
811
|
AND f.is_binary = FALSE
|
|
673
|
-
AND
|
|
812
|
+
AND (${phrasePlaceholders})
|
|
674
813
|
ORDER BY f.path
|
|
675
814
|
LIMIT ?
|
|
676
|
-
`, [repoId,
|
|
815
|
+
`, [repoId, ...extractedTerms.phrases, MAX_MATCHES_PER_KEYWORD * extractedTerms.phrases.length]);
|
|
816
|
+
const boostProfile = params.boost_profile ?? "default";
|
|
677
817
|
for (const row of rows) {
|
|
678
818
|
if (row.content === null) {
|
|
679
819
|
continue;
|
|
680
820
|
}
|
|
821
|
+
// どのフレーズにマッチしたかをチェック
|
|
822
|
+
const lowerContent = row.content.toLowerCase();
|
|
823
|
+
const matchedPhrases = extractedTerms.phrases.filter((phrase) => lowerContent.includes(phrase));
|
|
824
|
+
if (matchedPhrases.length === 0) {
|
|
825
|
+
continue; // Should not happen, but defensive check
|
|
826
|
+
}
|
|
681
827
|
const candidate = ensureCandidate(candidates, row.path);
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
828
|
+
// 各マッチしたフレーズに対してスコアリング
|
|
829
|
+
for (const phrase of matchedPhrases) {
|
|
830
|
+
// フレーズマッチは通常の2倍のスコア
|
|
831
|
+
candidate.score += weights.textMatch * 2.0;
|
|
832
|
+
candidate.reasons.add(`phrase:${phrase}`);
|
|
833
|
+
}
|
|
834
|
+
// Apply boost profile once per file
|
|
835
|
+
applyBoostProfile(candidate, row, boostProfile, extractedTerms, weights.pathMatch);
|
|
836
|
+
// Use first matched phrase for preview (guaranteed to exist due to length check above)
|
|
837
|
+
const { line } = buildPreview(row.content, matchedPhrases[0]);
|
|
838
|
+
candidate.matchLine =
|
|
839
|
+
candidate.matchLine === null ? line : Math.min(candidate.matchLine, line);
|
|
840
|
+
candidate.content ??= row.content;
|
|
841
|
+
candidate.lang ??= row.lang;
|
|
842
|
+
candidate.ext ??= row.ext;
|
|
843
|
+
candidate.totalLines ??= row.content.length === 0 ? 0 : row.content.split(/\r?\n/).length;
|
|
844
|
+
candidate.embedding ??= parseEmbedding(row.vector_json ?? null, row.vector_dims ?? null);
|
|
845
|
+
stringMatchSeeds.add(row.path);
|
|
846
|
+
if (!fileCache.has(row.path)) {
|
|
847
|
+
fileCache.set(row.path, {
|
|
848
|
+
content: row.content,
|
|
849
|
+
lang: row.lang,
|
|
850
|
+
ext: row.ext,
|
|
851
|
+
totalLines: candidate.totalLines ?? 0,
|
|
852
|
+
embedding: candidate.embedding,
|
|
853
|
+
});
|
|
854
|
+
}
|
|
855
|
+
}
|
|
856
|
+
}
|
|
857
|
+
// キーワードマッチング(通常の重み)- 統合クエリでパフォーマンス改善
|
|
858
|
+
if (extractedTerms.keywords.length > 0) {
|
|
859
|
+
const keywordPlaceholders = extractedTerms.keywords
|
|
860
|
+
.map(() => "b.content ILIKE '%' || ? || '%'")
|
|
861
|
+
.join(" OR ");
|
|
862
|
+
const rows = await db.all(`
|
|
863
|
+
SELECT f.path, f.lang, f.ext, f.is_binary, b.content, fe.vector_json, fe.dims AS vector_dims
|
|
864
|
+
FROM file f
|
|
865
|
+
JOIN blob b ON b.hash = f.blob_hash
|
|
866
|
+
LEFT JOIN file_embedding fe
|
|
867
|
+
ON fe.repo_id = f.repo_id
|
|
868
|
+
AND fe.path = f.path
|
|
869
|
+
WHERE f.repo_id = ?
|
|
870
|
+
AND f.is_binary = FALSE
|
|
871
|
+
AND (${keywordPlaceholders})
|
|
872
|
+
ORDER BY f.path
|
|
873
|
+
LIMIT ?
|
|
874
|
+
`, [repoId, ...extractedTerms.keywords, MAX_MATCHES_PER_KEYWORD * extractedTerms.keywords.length]);
|
|
875
|
+
const boostProfile = params.boost_profile ?? "default";
|
|
876
|
+
for (const row of rows) {
|
|
877
|
+
if (row.content === null) {
|
|
878
|
+
continue;
|
|
879
|
+
}
|
|
880
|
+
// どのキーワードにマッチしたかをチェック
|
|
881
|
+
const lowerContent = row.content.toLowerCase();
|
|
882
|
+
const matchedKeywords = extractedTerms.keywords.filter((keyword) => lowerContent.includes(keyword));
|
|
883
|
+
if (matchedKeywords.length === 0) {
|
|
884
|
+
continue; // Should not happen, but defensive check
|
|
885
|
+
}
|
|
886
|
+
const candidate = ensureCandidate(candidates, row.path);
|
|
887
|
+
// 各マッチしたキーワードに対してスコアリング
|
|
888
|
+
for (const keyword of matchedKeywords) {
|
|
889
|
+
candidate.score += weights.textMatch;
|
|
890
|
+
candidate.reasons.add(`text:${keyword}`);
|
|
891
|
+
}
|
|
892
|
+
// Apply boost profile once per file
|
|
893
|
+
applyBoostProfile(candidate, row, boostProfile, extractedTerms, weights.pathMatch);
|
|
894
|
+
// Use first matched keyword for preview (guaranteed to exist due to length check above)
|
|
895
|
+
const { line } = buildPreview(row.content, matchedKeywords[0]);
|
|
688
896
|
candidate.matchLine =
|
|
689
897
|
candidate.matchLine === null ? line : Math.min(candidate.matchLine, line);
|
|
690
898
|
candidate.content ??= row.content;
|