persyst-mcp 2.1.2 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/init.js +7 -0
- package/index.js +41 -0
- package/package.json +2 -2
- package/src/database.js +926 -877
- package/src/extractor-heuristic.js +324 -250
- package/src/git.js +9 -3
- package/src/search.js +561 -456
- package/src/server.js +72 -67
- package/src/tools.js +127 -16
- package/src/watcher.js +306 -0
|
@@ -1,250 +1,324 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* extractor-heuristic.js — Tier 2: Zero-Cost Regex-Based Fact Extractor
|
|
3
|
-
*
|
|
4
|
-
* Scans raw conversation text for explicit developer preference signals:
|
|
5
|
-
* "I prefer...", "we decided...", "always use...", "stack includes..."
|
|
6
|
-
*
|
|
7
|
-
* Design decisions:
|
|
8
|
-
* - Runs synchronously — zero latency overhead on the hot path
|
|
9
|
-
* - Conservative extraction: high-precision, low-recall
|
|
10
|
-
* - Returns structured facts with confidence scores (0.0 - 1.0)
|
|
11
|
-
* - Deduplication-ready: facts are normalized before output
|
|
12
|
-
*
|
|
13
|
-
* This is NOT the primary extraction tier. It's a lightweight safety net
|
|
14
|
-
* that catches the most obvious signals when Tier 3 (LLM) is unavailable
|
|
15
|
-
* or still processing asynchronously.
|
|
16
|
-
*/
|
|
17
|
-
|
|
18
|
-
// ============================================================
|
|
19
|
-
// PATTERN DEFINITIONS
|
|
20
|
-
// Ordered by specificity — most specific patterns first
|
|
21
|
-
// Each pattern has: regex, category, confidence, and a template
|
|
22
|
-
// to normalize the matched text into a clean fact statement.
|
|
23
|
-
// ============================================================
|
|
24
|
-
|
|
25
|
-
const PATTERNS = [
|
|
26
|
-
// --- Decision patterns (highest confidence) ---
|
|
27
|
-
{
|
|
28
|
-
regex: /(?:we|i|the team)\s+(?:have\s+)?decided\s+(?:to\s+)?(?:use|go\s+with|adopt|switch\s+to|move\s+to)\s+(.+?)(?:\.|$)/gi,
|
|
29
|
-
category: 'decision',
|
|
30
|
-
confidence: 0.85,
|
|
31
|
-
template: (match) => `Decision: ${cleanFact(match[1])}`
|
|
32
|
-
},
|
|
33
|
-
{
|
|
34
|
-
regex: /(?:we(?:'re|\s+are)?\s+)?(?:going|moving)\s+(?:to\s+)?(?:use|adopt|switch\s+to|migrate\s+to)\s+(.+?)(?:\s+(?:for|because|since|as)\b|\.|$)/gi,
|
|
35
|
-
category: 'decision',
|
|
36
|
-
confidence: 0.80,
|
|
37
|
-
template: (match) => `Decision: Moving to ${cleanFact(match[1])}`
|
|
38
|
-
},
|
|
39
|
-
|
|
40
|
-
// --- Explicit preference patterns ---
|
|
41
|
-
{
|
|
42
|
-
regex: /i\s+(?:always\s+)?prefer\s+(.+?)(?:\s+(?:over|instead\s+of|rather\s+than)\s+(.+?))?(?:\.|$)/gi,
|
|
43
|
-
category: 'preference',
|
|
44
|
-
confidence: 0.80,
|
|
45
|
-
template: (match) => {
|
|
46
|
-
const pref = cleanFact(match[1]);
|
|
47
|
-
const alt = match[2] ? ` over ${cleanFact(match[2])}` : '';
|
|
48
|
-
return `Preference: ${pref}${alt}`;
|
|
49
|
-
}
|
|
50
|
-
},
|
|
51
|
-
{
|
|
52
|
-
regex: /(?:we|i)\s+(?:should\s+)?(?:always|never)\s+(?:use|avoid|include|add|write|create)\s+(.+?)(?:\.|$)/gi,
|
|
53
|
-
category: 'preference',
|
|
54
|
-
confidence: 0.75,
|
|
55
|
-
template: (match) => `Rule: ${cleanFact(match[0])}`
|
|
56
|
-
},
|
|
57
|
-
|
|
58
|
-
// --- Stack / technology patterns ---
|
|
59
|
-
{
|
|
60
|
-
regex: /(?:our|the|my)\s+(?:tech\s+)?stack\s+(?:includes?|uses?|is|has)\s+(.+?)(?:\.\s|\.$|$)/gim,
|
|
61
|
-
category: 'stack',
|
|
62
|
-
confidence: 0.85,
|
|
63
|
-
template: (match) => `Stack: ${cleanFact(match[1])}`
|
|
64
|
-
},
|
|
65
|
-
{
|
|
66
|
-
regex: /(?:we(?:'re|\s+are)?\s+)?using\s+(.+?)\s+(?:for|as)\s+(?:our|the)\s+(.+?)(?:\.|$)/gi,
|
|
67
|
-
category: 'stack',
|
|
68
|
-
confidence: 0.80,
|
|
69
|
-
template: (match) => `Stack: Using ${cleanFact(match[1])} for ${cleanFact(match[2])}`
|
|
70
|
-
},
|
|
71
|
-
{
|
|
72
|
-
regex: /(?:our|the)\s+(?:backend|frontend|database|api|server|client|infra(?:structure)?)\s+(?:is|uses?|runs?\s+on)\s+(.+?)(?:\.|$)/gi,
|
|
73
|
-
category: 'stack',
|
|
74
|
-
confidence: 0.80,
|
|
75
|
-
template: (match) => `Stack: ${cleanFact(match[0])}`
|
|
76
|
-
},
|
|
77
|
-
|
|
78
|
-
// --- Naming / convention patterns ---
|
|
79
|
-
{
|
|
80
|
-
regex: /(?:name|call|rename)\s+(?:it|this|the\s+\w+)\s+["'`]?(\w[\w\-\.]+)["'`]?/gi,
|
|
81
|
-
category: 'naming',
|
|
82
|
-
confidence: 0.70,
|
|
83
|
-
template: (match) => `Naming: ${cleanFact(match[0])}`
|
|
84
|
-
},
|
|
85
|
-
|
|
86
|
-
// --- Architecture patterns ---
|
|
87
|
-
{
|
|
88
|
-
regex: /(?:the\s+)?(?:project|app|application|system|architecture)\s+(?:follows?|uses?|is\s+based\s+on|implements?)\s+(.+?)(?:\s+pattern|\s+architecture)?(?:\.|$)/gi,
|
|
89
|
-
category: 'architecture',
|
|
90
|
-
confidence: 0.80,
|
|
91
|
-
template: (match) => `Architecture: ${cleanFact(match[1])}`
|
|
92
|
-
},
|
|
93
|
-
|
|
94
|
-
// --- Coding rule / style patterns ---
|
|
95
|
-
{
|
|
96
|
-
regex: /(?:always|never|must|should|don't|do\s+not)\s+(?:use|write|create|add|include|put|place|keep)\s+(.+?)(?:\.|$)/gi,
|
|
97
|
-
category: 'rule',
|
|
98
|
-
confidence: 0.70,
|
|
99
|
-
template: (match) => `Rule: ${cleanFact(match[0])}`
|
|
100
|
-
},
|
|
101
|
-
|
|
102
|
-
// --- Config / env patterns ---
|
|
103
|
-
{
|
|
104
|
-
regex: /(?:set|change|update|configure)\s+(?:the\s+)?(?:port|host|env|environment|config|setting)\s+(?:to|=|:)\s*["'`]?(.+?)["'`]?(?:\.|$)/gi,
|
|
105
|
-
category: 'config',
|
|
106
|
-
confidence: 0.75,
|
|
107
|
-
template: (match) => `Config: ${cleanFact(match[0])}`
|
|
108
|
-
}
|
|
109
|
-
];
|
|
110
|
-
|
|
111
|
-
// ============================================================
|
|
112
|
-
// NOISE FILTERS
|
|
113
|
-
// Skip lines that look like code, errors, or system output
|
|
114
|
-
// ============================================================
|
|
115
|
-
|
|
116
|
-
const NOISE_PATTERNS = [
|
|
117
|
-
/^[\s]*(?:import|export|const|let|var|function|class|if|else|for|while|return|throw|try|catch)\s/,
|
|
118
|
-
/^[\s]*[{}\[\]();]/,
|
|
119
|
-
/^[\s]*\/\//,
|
|
120
|
-
/^[\s]*\*/,
|
|
121
|
-
/^[\s]*```/,
|
|
122
|
-
/^\s*$/,
|
|
123
|
-
/^(?:error|warning|info|debug|trace):/i,
|
|
124
|
-
/^\s*at\s+\w+/, // stack trace lines
|
|
125
|
-
/^[A-Z_]{2,}=/, // ENV variable assignments
|
|
126
|
-
/^\d{4}-\d{2}-\d{2}/, // timestamp lines
|
|
127
|
-
];
|
|
128
|
-
|
|
129
|
-
/**
|
|
130
|
-
* Check if a line looks like noise (code, logs, etc.)
|
|
131
|
-
* @param {string} line
|
|
132
|
-
* @returns {boolean}
|
|
133
|
-
*/
|
|
134
|
-
function isNoiseLine(line) {
|
|
135
|
-
return NOISE_PATTERNS.some(p => p.test(line));
|
|
136
|
-
}
|
|
137
|
-
|
|
138
|
-
// ============================================================
|
|
139
|
-
// FACT NORMALIZATION
|
|
140
|
-
// ============================================================
|
|
141
|
-
|
|
142
|
-
/**
|
|
143
|
-
* Clean and normalize an extracted fact string.
|
|
144
|
-
* Removes trailing punctuation, excess whitespace, and truncates.
|
|
145
|
-
* @param {string} raw
|
|
146
|
-
* @returns {string}
|
|
147
|
-
*/
|
|
148
|
-
function cleanFact(raw) {
|
|
149
|
-
if (!raw) return '';
|
|
150
|
-
return raw
|
|
151
|
-
.trim()
|
|
152
|
-
.replace(/[\s]+/g, ' ') // collapse whitespace
|
|
153
|
-
.replace(/[,;:]+$/, '') // strip trailing punctuation
|
|
154
|
-
.replace(/^["'`]+|["'`]+$/g, '') // strip quotes
|
|
155
|
-
.slice(0, 200); // hard max fact length
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
//
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
*
|
|
237
|
-
*
|
|
238
|
-
*
|
|
239
|
-
* @param {
|
|
240
|
-
* @
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
1
|
+
/**
|
|
2
|
+
* extractor-heuristic.js — Tier 2: Zero-Cost Regex-Based Fact Extractor
|
|
3
|
+
*
|
|
4
|
+
* Scans raw conversation text for explicit developer preference signals:
|
|
5
|
+
* "I prefer...", "we decided...", "always use...", "stack includes..."
|
|
6
|
+
*
|
|
7
|
+
* Design decisions:
|
|
8
|
+
* - Runs synchronously — zero latency overhead on the hot path
|
|
9
|
+
* - Conservative extraction: high-precision, low-recall
|
|
10
|
+
* - Returns structured facts with confidence scores (0.0 - 1.0)
|
|
11
|
+
* - Deduplication-ready: facts are normalized before output
|
|
12
|
+
*
|
|
13
|
+
* This is NOT the primary extraction tier. It's a lightweight safety net
|
|
14
|
+
* that catches the most obvious signals when Tier 3 (LLM) is unavailable
|
|
15
|
+
* or still processing asynchronously.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
// ============================================================
|
|
19
|
+
// PATTERN DEFINITIONS
|
|
20
|
+
// Ordered by specificity — most specific patterns first
|
|
21
|
+
// Each pattern has: regex, category, confidence, and a template
|
|
22
|
+
// to normalize the matched text into a clean fact statement.
|
|
23
|
+
// ============================================================
|
|
24
|
+
|
|
25
|
+
const PATTERNS = [
|
|
26
|
+
// --- Decision patterns (highest confidence) ---
|
|
27
|
+
{
|
|
28
|
+
regex: /(?:we|i|the team)\s+(?:have\s+)?decided\s+(?:to\s+)?(?:use|go\s+with|adopt|switch\s+to|move\s+to)\s+(.+?)(?:\.|$)/gi,
|
|
29
|
+
category: 'decision',
|
|
30
|
+
confidence: 0.85,
|
|
31
|
+
template: (match) => `Decision: ${cleanFact(match[1])}`
|
|
32
|
+
},
|
|
33
|
+
{
|
|
34
|
+
regex: /(?:we(?:'re|\s+are)?\s+)?(?:going|moving)\s+(?:to\s+)?(?:use|adopt|switch\s+to|migrate\s+to)\s+(.+?)(?:\s+(?:for|because|since|as)\b|\.|$)/gi,
|
|
35
|
+
category: 'decision',
|
|
36
|
+
confidence: 0.80,
|
|
37
|
+
template: (match) => `Decision: Moving to ${cleanFact(match[1])}`
|
|
38
|
+
},
|
|
39
|
+
|
|
40
|
+
// --- Explicit preference patterns ---
|
|
41
|
+
{
|
|
42
|
+
regex: /i\s+(?:always\s+)?prefer\s+(.+?)(?:\s+(?:over|instead\s+of|rather\s+than)\s+(.+?))?(?:\.|$)/gi,
|
|
43
|
+
category: 'preference',
|
|
44
|
+
confidence: 0.80,
|
|
45
|
+
template: (match) => {
|
|
46
|
+
const pref = cleanFact(match[1]);
|
|
47
|
+
const alt = match[2] ? ` over ${cleanFact(match[2])}` : '';
|
|
48
|
+
return `Preference: ${pref}${alt}`;
|
|
49
|
+
}
|
|
50
|
+
},
|
|
51
|
+
{
|
|
52
|
+
regex: /(?:we|i)\s+(?:should\s+)?(?:always|never)\s+(?:use|avoid|include|add|write|create)\s+(.+?)(?:\.|$)/gi,
|
|
53
|
+
category: 'preference',
|
|
54
|
+
confidence: 0.75,
|
|
55
|
+
template: (match) => `Rule: ${cleanFact(match[0])}`
|
|
56
|
+
},
|
|
57
|
+
|
|
58
|
+
// --- Stack / technology patterns ---
|
|
59
|
+
{
|
|
60
|
+
regex: /(?:our|the|my)\s+(?:tech\s+)?stack\s+(?:includes?|uses?|is|has)\s+(.+?)(?:\.\s|\.$|$)/gim,
|
|
61
|
+
category: 'stack',
|
|
62
|
+
confidence: 0.85,
|
|
63
|
+
template: (match) => `Stack: ${cleanFact(match[1])}`
|
|
64
|
+
},
|
|
65
|
+
{
|
|
66
|
+
regex: /(?:we(?:'re|\s+are)?\s+)?using\s+(.+?)\s+(?:for|as)\s+(?:our|the)\s+(.+?)(?:\.|$)/gi,
|
|
67
|
+
category: 'stack',
|
|
68
|
+
confidence: 0.80,
|
|
69
|
+
template: (match) => `Stack: Using ${cleanFact(match[1])} for ${cleanFact(match[2])}`
|
|
70
|
+
},
|
|
71
|
+
{
|
|
72
|
+
regex: /(?:our|the)\s+(?:backend|frontend|database|api|server|client|infra(?:structure)?)\s+(?:is|uses?|runs?\s+on)\s+(.+?)(?:\.|$)/gi,
|
|
73
|
+
category: 'stack',
|
|
74
|
+
confidence: 0.80,
|
|
75
|
+
template: (match) => `Stack: ${cleanFact(match[0])}`
|
|
76
|
+
},
|
|
77
|
+
|
|
78
|
+
// --- Naming / convention patterns ---
|
|
79
|
+
{
|
|
80
|
+
regex: /(?:name|call|rename)\s+(?:it|this|the\s+\w+)\s+["'`]?(\w[\w\-\.]+)["'`]?/gi,
|
|
81
|
+
category: 'naming',
|
|
82
|
+
confidence: 0.70,
|
|
83
|
+
template: (match) => `Naming: ${cleanFact(match[0])}`
|
|
84
|
+
},
|
|
85
|
+
|
|
86
|
+
// --- Architecture patterns ---
|
|
87
|
+
{
|
|
88
|
+
regex: /(?:the\s+)?(?:project|app|application|system|architecture)\s+(?:follows?|uses?|is\s+based\s+on|implements?)\s+(.+?)(?:\s+pattern|\s+architecture)?(?:\.|$)/gi,
|
|
89
|
+
category: 'architecture',
|
|
90
|
+
confidence: 0.80,
|
|
91
|
+
template: (match) => `Architecture: ${cleanFact(match[1])}`
|
|
92
|
+
},
|
|
93
|
+
|
|
94
|
+
// --- Coding rule / style patterns ---
|
|
95
|
+
{
|
|
96
|
+
regex: /(?:always|never|must|should|don't|do\s+not)\s+(?:use|write|create|add|include|put|place|keep)\s+(.+?)(?:\.|$)/gi,
|
|
97
|
+
category: 'rule',
|
|
98
|
+
confidence: 0.70,
|
|
99
|
+
template: (match) => `Rule: ${cleanFact(match[0])}`
|
|
100
|
+
},
|
|
101
|
+
|
|
102
|
+
// --- Config / env patterns ---
|
|
103
|
+
{
|
|
104
|
+
regex: /(?:set|change|update|configure)\s+(?:the\s+)?(?:port|host|env|environment|config|setting)\s+(?:to|=|:)\s*["'`]?(.+?)["'`]?(?:\.|$)/gi,
|
|
105
|
+
category: 'config',
|
|
106
|
+
confidence: 0.75,
|
|
107
|
+
template: (match) => `Config: ${cleanFact(match[0])}`
|
|
108
|
+
}
|
|
109
|
+
];
|
|
110
|
+
|
|
111
|
+
// ============================================================
|
|
112
|
+
// NOISE FILTERS
|
|
113
|
+
// Skip lines that look like code, errors, or system output
|
|
114
|
+
// ============================================================
|
|
115
|
+
|
|
116
|
+
const NOISE_PATTERNS = [
|
|
117
|
+
/^[\s]*(?:import|export|const|let|var|function|class|if|else|for|while|return|throw|try|catch)\s/,
|
|
118
|
+
/^[\s]*[{}\[\]();]/,
|
|
119
|
+
/^[\s]*\/\//,
|
|
120
|
+
/^[\s]*\*/,
|
|
121
|
+
/^[\s]*```/,
|
|
122
|
+
/^\s*$/,
|
|
123
|
+
/^(?:error|warning|info|debug|trace):/i,
|
|
124
|
+
/^\s*at\s+\w+/, // stack trace lines
|
|
125
|
+
/^[A-Z_]{2,}=/, // ENV variable assignments
|
|
126
|
+
/^\d{4}-\d{2}-\d{2}/, // timestamp lines
|
|
127
|
+
];
|
|
128
|
+
|
|
129
|
+
/**
|
|
130
|
+
* Check if a line looks like noise (code, logs, etc.)
|
|
131
|
+
* @param {string} line
|
|
132
|
+
* @returns {boolean}
|
|
133
|
+
*/
|
|
134
|
+
function isNoiseLine(line) {
|
|
135
|
+
return NOISE_PATTERNS.some(p => p.test(line));
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
// ============================================================
|
|
139
|
+
// FACT NORMALIZATION & COGNITIVE FILTER
|
|
140
|
+
// ============================================================
|
|
141
|
+
|
|
142
|
+
/**
|
|
143
|
+
* Clean and normalize an extracted fact string.
|
|
144
|
+
* Removes trailing punctuation, excess whitespace, and truncates.
|
|
145
|
+
* @param {string} raw
|
|
146
|
+
* @returns {string}
|
|
147
|
+
*/
|
|
148
|
+
function cleanFact(raw) {
|
|
149
|
+
if (!raw) return '';
|
|
150
|
+
return raw
|
|
151
|
+
.trim()
|
|
152
|
+
.replace(/[\s]+/g, ' ') // collapse whitespace
|
|
153
|
+
.replace(/[,;:]+$/, '') // strip trailing punctuation
|
|
154
|
+
.replace(/^["'`]+|["'`]+$/g, '') // strip quotes
|
|
155
|
+
.slice(0, 200); // hard max fact length
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
// List of programming/tech concepts to distinguish tech context from conversational filler
|
|
159
|
+
const TECH_CONCEPTS = [
|
|
160
|
+
'mode', 'theme', 'config', 'stack', 'style', 'code', 'file', 'folder', 'path',
|
|
161
|
+
'api', 'endpoint', 'json', 'data', 'db', 'database', 'table', 'migration',
|
|
162
|
+
'schema', 'sql', 'query', 'url', 'port', 'host', 'env', 'environment',
|
|
163
|
+
'node', 'npm', 'git', 'react', 'vue', 'angular', 'svelte', 'next', 'express',
|
|
164
|
+
'postgres', 'sqlite', 'mongo', 'mysql', 'docker', 'ubuntu', 'linux', 'server',
|
|
165
|
+
'pipeline', 'ci', 'cd', 'github', 'actions', 'oauth', 'auth', 'security',
|
|
166
|
+
'token', 'key', 'credential', 'package', 'dependency', 'library', 'script',
|
|
167
|
+
'test', 'jest', 'vitest', 'eslint', 'prettier', 'tailwind', 'css', 'html',
|
|
168
|
+
'js', 'ts', 'typescript', 'javascript', 'eval', 'function', 'class', 'component',
|
|
169
|
+
'import', 'export', 'require', 'const', 'let', 'var', 'compiler', 'build',
|
|
170
|
+
'cli', 'command', 'terminal', 'mcp', 'server', 'client', 'persyst', 'memory'
|
|
171
|
+
];
|
|
172
|
+
|
|
173
|
+
/**
|
|
174
|
+
* Filter out conversational filler and keep only valid technical statements/preferences.
|
|
175
|
+
* @param {string} content - The extracted fact text
|
|
176
|
+
* @returns {boolean} - true if it is a valid, high-value fact
|
|
177
|
+
*/
|
|
178
|
+
function cognitiveNoiseFilter(content) {
|
|
179
|
+
const normalized = content.toLowerCase().trim();
|
|
180
|
+
|
|
181
|
+
// 1. Filter out interrogatives (questions)
|
|
182
|
+
const questionWords = ['how', 'why', 'what', 'where', 'when', 'who', 'can', 'could', 'would', 'is', 'are', 'should'];
|
|
183
|
+
if (normalized.endsWith('?')) return false;
|
|
184
|
+
for (const q of questionWords) {
|
|
185
|
+
if (normalized.startsWith(q + ' ') || normalized.includes(` ${q} `) || normalized.includes(`:${q} `)) {
|
|
186
|
+
if (normalized.includes(' ?') || normalized.endsWith('?')) return false;
|
|
187
|
+
if (/preference:\s+(?:can|could|would|is|are|how|why|what|where)\s/i.test(content)) return false;
|
|
188
|
+
if (/rule:\s+(?:can|could|would|is|are|how|why|what|where)\s/i.test(content)) return false;
|
|
189
|
+
if (/decision:\s+(?:can|could|would|is|are|how|why|what|where)\s/i.test(content)) return false;
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
// 2. Filter out transient pronouns/vague statements without enough context
|
|
194
|
+
if (/preference:\s+(?:this|that|it|these|those|us|me|them|him|her)\b/i.test(content)) return false;
|
|
195
|
+
if (/decision:\s+(?:this|that|it|these|those|us|me|them|him|her)\b/i.test(content)) return false;
|
|
196
|
+
|
|
197
|
+
// 3. Filter out transient time references indicating very short-term state
|
|
198
|
+
const transientTerms = ['today', 'tomorrow', 'yesterday', 'now', 'just', 'temporary', 'currently', 'for now', 'briefly', 'at the moment'];
|
|
199
|
+
for (const term of transientTerms) {
|
|
200
|
+
if (normalized.includes(` ${term} `) || normalized.endsWith(` ${term}`)) {
|
|
201
|
+
return false;
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
// 4. Filter out trace logs, build outputs, compile errors
|
|
206
|
+
if (normalized.includes('at ') && normalized.includes('.js:')) return false;
|
|
207
|
+
if (normalized.includes('error:') || normalized.includes('exception:')) return false;
|
|
208
|
+
if (normalized.includes('exit code') || normalized.includes('npm error')) return false;
|
|
209
|
+
|
|
210
|
+
// 5. Require at least one programming/project-related concept
|
|
211
|
+
const words = normalized.split(/[^a-zA-Z0-9\-\.\/]+/);
|
|
212
|
+
const hasTechTerm = words.some(w => {
|
|
213
|
+
return TECH_CONCEPTS.some(concept => {
|
|
214
|
+
if (concept.length <= 2) {
|
|
215
|
+
return w === concept;
|
|
216
|
+
}
|
|
217
|
+
return w.includes(concept);
|
|
218
|
+
}) ||
|
|
219
|
+
w.endsWith('.js') || w.endsWith('.json') || w.endsWith('.css') || w.endsWith('.md') ||
|
|
220
|
+
w.includes('/') || w.includes('\\');
|
|
221
|
+
});
|
|
222
|
+
|
|
223
|
+
if (!hasTechTerm) {
|
|
224
|
+
return false;
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
return true;
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
// ============================================================
|
|
231
|
+
// MAIN EXTRACTION FUNCTION
|
|
232
|
+
// ============================================================
|
|
233
|
+
|
|
234
|
+
/**
|
|
235
|
+
* Extract facts from raw conversation text using regex heuristics.
|
|
236
|
+
*
|
|
237
|
+
* @param {string} text - Raw conversation text (user prompt or full turn)
|
|
238
|
+
* @param {Object} [options={}]
|
|
239
|
+
* @param {number} [options.minConfidence=0.65] - Minimum confidence to include a fact
|
|
240
|
+
* @param {number} [options.maxFacts=10] - Maximum facts to extract per call
|
|
241
|
+
* @returns {Array<{content: string, category: string, confidence: number}>}
|
|
242
|
+
*
|
|
243
|
+
* @example
|
|
244
|
+
* const facts = extractHeuristic("I prefer Postgres over SQLite for our backend database.");
|
|
245
|
+
* // => [{ content: "Preference: Postgres over SQLite for our backend database", category: "preference", confidence: 0.80 }]
|
|
246
|
+
*/
|
|
247
|
+
export function extractHeuristic(text, options = {}) {
|
|
248
|
+
const {
|
|
249
|
+
minConfidence = 0.65,
|
|
250
|
+
maxFacts = 10
|
|
251
|
+
} = options;
|
|
252
|
+
|
|
253
|
+
if (!text || typeof text !== 'string' || text.length < 10) {
|
|
254
|
+
return [];
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
const facts = [];
|
|
258
|
+
const seen = new Set(); // dedup by normalized content
|
|
259
|
+
|
|
260
|
+
// Process line-by-line to filter noise
|
|
261
|
+
const lines = text.split('\n');
|
|
262
|
+
const cleanLines = lines.filter(line => !isNoiseLine(line));
|
|
263
|
+
const cleanText = cleanLines.join('\n');
|
|
264
|
+
|
|
265
|
+
for (const pattern of PATTERNS) {
|
|
266
|
+
// Reset regex state for global matching
|
|
267
|
+
pattern.regex.lastIndex = 0;
|
|
268
|
+
|
|
269
|
+
let match;
|
|
270
|
+
while ((match = pattern.regex.exec(cleanText)) !== null) {
|
|
271
|
+
// Skip matches that are too short to be meaningful
|
|
272
|
+
if (match[0].length < 8) continue;
|
|
273
|
+
|
|
274
|
+
try {
|
|
275
|
+
const content = pattern.template(match);
|
|
276
|
+
if (!content || content.length < 5) continue;
|
|
277
|
+
|
|
278
|
+
if (!cognitiveNoiseFilter(content)) continue;
|
|
279
|
+
|
|
280
|
+
// Normalize for dedup
|
|
281
|
+
const key = content.toLowerCase().replace(/\s+/g, ' ').trim();
|
|
282
|
+
if (seen.has(key)) continue;
|
|
283
|
+
seen.add(key);
|
|
284
|
+
|
|
285
|
+
if (pattern.confidence >= minConfidence) {
|
|
286
|
+
facts.push({
|
|
287
|
+
content,
|
|
288
|
+
category: pattern.category,
|
|
289
|
+
confidence: pattern.confidence
|
|
290
|
+
});
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
if (facts.length >= maxFacts) break;
|
|
294
|
+
} catch (_) {
|
|
295
|
+
// Template execution failed — skip this match
|
|
296
|
+
continue;
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
if (facts.length >= maxFacts) break;
|
|
301
|
+
}
|
|
302
|
+
|
|
303
|
+
// Sort by confidence descending
|
|
304
|
+
facts.sort((a, b) => b.confidence - a.confidence);
|
|
305
|
+
|
|
306
|
+
return facts;
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
/**
|
|
310
|
+
* Quick check: does this text contain any extractable signals?
|
|
311
|
+
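* Cheaper than running full extraction — use as a gate. It only tests the raw text against PATTERNS (no noise-line filtering, cognitive filtering, or confidence threshold), so a true result does not guarantee that extractHeuristic() returns any facts.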
|
|
312
|
+
*
|
|
313
|
+
* @param {string} text
|
|
314
|
+
* @returns {boolean}
|
|
315
|
+
*/
|
|
316
|
+
export function hasExtractableSignals(text) {
|
|
317
|
+
if (!text || text.length < 10) return false;
|
|
318
|
+
|
|
319
|
+
for (const pattern of PATTERNS) {
|
|
320
|
+
pattern.regex.lastIndex = 0;
|
|
321
|
+
if (pattern.regex.test(text)) return true;
|
|
322
|
+
}
|
|
323
|
+
return false;
|
|
324
|
+
}
|
package/src/git.js
CHANGED
|
@@ -55,8 +55,8 @@ export async function getRecentCommits(repoPath, count = 20) {
|
|
|
55
55
|
|
|
56
56
|
// Build a readable memory string
|
|
57
57
|
let fullText = body
|
|
58
|
-
? `[${hash.slice(0, 7)}] ${subject} — by ${author} on ${date}. ${body}`
|
|
59
|
-
: `[${hash.slice(0, 7)}] ${subject} — by ${author} on ${date}`;
|
|
58
|
+
? `[${hash.slice(0, 7)}] Commit: ${subject} — by ${author} on ${date}. ${body}`
|
|
59
|
+
: `[${hash.slice(0, 7)}] Commit: ${subject} — by ${author} on ${date}`;
|
|
60
60
|
|
|
61
61
|
if (notes) {
|
|
62
62
|
fullText += ` [PR Notes] ${notes}`;
|
|
@@ -86,7 +86,13 @@ export async function getRecentCommits(repoPath, count = 20) {
|
|
|
86
86
|
throw new Error(`Not a git repository: ${repoPath}`);
|
|
87
87
|
}
|
|
88
88
|
if (message.includes('ENOENT') || message.includes('not recognized')) {
|
|
89
|
-
throw new Error(
|
|
89
|
+
throw new Error(
|
|
90
|
+
'Git binary not found. Git is required to ingest commits.\n' +
|
|
91
|
+
'Please install Git and ensure it is added to your system PATH:\n' +
|
|
92
|
+
' - Windows: Download from https://git-scm.com/download/win\n' +
|
|
93
|
+
' - macOS: Run `brew install git` or install Xcode Command Line Tools\n' +
|
|
94
|
+
' - Linux: Run `sudo apt-get install git` or equivalent.'
|
|
95
|
+
);
|
|
90
96
|
}
|
|
91
97
|
throw new Error(`Failed to read git log: ${message}`);
|
|
92
98
|
}
|