persyst-mcp 2.2.4 → 2.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,324 +1,505 @@
1
- /**
2
- * extractor-heuristic.js — Tier 2: Zero-Cost Regex-Based Fact Extractor
3
- *
4
- * Scans raw conversation text for explicit developer preference signals:
5
- * "I prefer...", "we decided...", "always use...", "stack includes..."
6
- *
7
- * Design decisions:
8
- * - Runs synchronously — zero latency overhead on the hot path
9
- * - Conservative extraction: high-precision, low-recall
10
- * - Returns structured facts with confidence scores (0.0 - 1.0)
11
- * - Deduplication-ready: facts are normalized before output
12
- *
13
- * This is NOT the primary extraction tier. It's a lightweight safety net
14
- * that catches the most obvious signals when Tier 3 (LLM) is unavailable
15
- * or still processing asynchronously.
16
- */
17
-
18
- // ============================================================
19
- // PATTERN DEFINITIONS
20
- // Ordered by specificity — most specific patterns first
21
- // Each pattern has: regex, category, confidence, and a template
22
- // to normalize the matched text into a clean fact statement.
23
- // ============================================================
24
-
25
- const PATTERNS = [
26
- // --- Decision patterns (highest confidence) ---
27
- {
28
- regex: /(?:we|i|the team)\s+(?:have\s+)?decided\s+(?:to\s+)?(?:use|go\s+with|adopt|switch\s+to|move\s+to)\s+(.+?)(?:\.|$)/gi,
29
- category: 'decision',
30
- confidence: 0.85,
31
- template: (match) => `Decision: ${cleanFact(match[1])}`
32
- },
33
- {
34
- regex: /(?:we(?:'re|\s+are)?\s+)?(?:going|moving)\s+(?:to\s+)?(?:use|adopt|switch\s+to|migrate\s+to)\s+(.+?)(?:\s+(?:for|because|since|as)\b|\.|$)/gi,
35
- category: 'decision',
36
- confidence: 0.80,
37
- template: (match) => `Decision: Moving to ${cleanFact(match[1])}`
38
- },
39
-
40
- // --- Explicit preference patterns ---
41
- {
42
- regex: /i\s+(?:always\s+)?prefer\s+(.+?)(?:\s+(?:over|instead\s+of|rather\s+than)\s+(.+?))?(?:\.|$)/gi,
43
- category: 'preference',
44
- confidence: 0.80,
45
- template: (match) => {
46
- const pref = cleanFact(match[1]);
47
- const alt = match[2] ? ` over ${cleanFact(match[2])}` : '';
48
- return `Preference: ${pref}${alt}`;
49
- }
50
- },
51
- {
52
- regex: /(?:we|i)\s+(?:should\s+)?(?:always|never)\s+(?:use|avoid|include|add|write|create)\s+(.+?)(?:\.|$)/gi,
53
- category: 'preference',
54
- confidence: 0.75,
55
- template: (match) => `Rule: ${cleanFact(match[0])}`
56
- },
57
-
58
- // --- Stack / technology patterns ---
59
- {
60
- regex: /(?:our|the|my)\s+(?:tech\s+)?stack\s+(?:includes?|uses?|is|has)\s+(.+?)(?:\.\s|\.$|$)/gim,
61
- category: 'stack',
62
- confidence: 0.85,
63
- template: (match) => `Stack: ${cleanFact(match[1])}`
64
- },
65
- {
66
- regex: /(?:we(?:'re|\s+are)?\s+)?using\s+(.+?)\s+(?:for|as)\s+(?:our|the)\s+(.+?)(?:\.|$)/gi,
67
- category: 'stack',
68
- confidence: 0.80,
69
- template: (match) => `Stack: Using ${cleanFact(match[1])} for ${cleanFact(match[2])}`
70
- },
71
- {
72
- regex: /(?:our|the)\s+(?:backend|frontend|database|api|server|client|infra(?:structure)?)\s+(?:is|uses?|runs?\s+on)\s+(.+?)(?:\.|$)/gi,
73
- category: 'stack',
74
- confidence: 0.80,
75
- template: (match) => `Stack: ${cleanFact(match[0])}`
76
- },
77
-
78
- // --- Naming / convention patterns ---
79
- {
80
- regex: /(?:name|call|rename)\s+(?:it|this|the\s+\w+)\s+["'`]?(\w[\w\-\.]+)["'`]?/gi,
81
- category: 'naming',
82
- confidence: 0.70,
83
- template: (match) => `Naming: ${cleanFact(match[0])}`
84
- },
85
-
86
- // --- Architecture patterns ---
87
- {
88
- regex: /(?:the\s+)?(?:project|app|application|system|architecture)\s+(?:follows?|uses?|is\s+based\s+on|implements?)\s+(.+?)(?:\s+pattern|\s+architecture)?(?:\.|$)/gi,
89
- category: 'architecture',
90
- confidence: 0.80,
91
- template: (match) => `Architecture: ${cleanFact(match[1])}`
92
- },
93
-
94
- // --- Coding rule / style patterns ---
95
- {
96
- regex: /(?:always|never|must|should|don't|do\s+not)\s+(?:use|write|create|add|include|put|place|keep)\s+(.+?)(?:\.|$)/gi,
97
- category: 'rule',
98
- confidence: 0.70,
99
- template: (match) => `Rule: ${cleanFact(match[0])}`
100
- },
101
-
102
- // --- Config / env patterns ---
103
- {
104
- regex: /(?:set|change|update|configure)\s+(?:the\s+)?(?:port|host|env|environment|config|setting)\s+(?:to|=|:)\s*["'`]?(.+?)["'`]?(?:\.|$)/gi,
105
- category: 'config',
106
- confidence: 0.75,
107
- template: (match) => `Config: ${cleanFact(match[0])}`
108
- }
109
- ];
110
-
111
- // ============================================================
112
- // NOISE FILTERS
113
- // Skip lines that look like code, errors, or system output
114
- // ============================================================
115
-
116
- const NOISE_PATTERNS = [
117
- /^[\s]*(?:import|export|const|let|var|function|class|if|else|for|while|return|throw|try|catch)\s/,
118
- /^[\s]*[{}\[\]();]/,
119
- /^[\s]*\/\//,
120
- /^[\s]*\*/,
121
- /^[\s]*```/,
122
- /^\s*$/,
123
- /^(?:error|warning|info|debug|trace):/i,
124
- /^\s*at\s+\w+/, // stack trace lines
125
- /^[A-Z_]{2,}=/, // ENV variable assignments
126
- /^\d{4}-\d{2}-\d{2}/, // timestamp lines
127
- ];
128
-
129
- /**
130
- * Check if a line looks like noise (code, logs, etc.)
131
- * @param {string} line
132
- * @returns {boolean}
133
- */
134
- function isNoiseLine(line) {
135
- return NOISE_PATTERNS.some(p => p.test(line));
136
- }
137
-
138
- // ============================================================
139
- // FACT NORMALIZATION & COGNITIVE FILTER
140
- // ============================================================
141
-
142
- /**
143
- * Clean and normalize an extracted fact string.
144
- * Removes trailing punctuation, excess whitespace, and truncates.
145
- * @param {string} raw
146
- * @returns {string}
147
- */
148
- function cleanFact(raw) {
149
- if (!raw) return '';
150
- return raw
151
- .trim()
152
- .replace(/[\s]+/g, ' ') // collapse whitespace
153
- .replace(/[,;:]+$/, '') // strip trailing punctuation
154
- .replace(/^["'`]+|["'`]+$/g, '') // strip quotes
155
- .slice(0, 200); // hard max fact length
156
- }
157
-
158
- // List of programming/tech concepts to distinguish tech context from conversational filler
159
- const TECH_CONCEPTS = [
160
- 'mode', 'theme', 'config', 'stack', 'style', 'code', 'file', 'folder', 'path',
161
- 'api', 'endpoint', 'json', 'data', 'db', 'database', 'table', 'migration',
162
- 'schema', 'sql', 'query', 'url', 'port', 'host', 'env', 'environment',
163
- 'node', 'npm', 'git', 'react', 'vue', 'angular', 'svelte', 'next', 'express',
164
- 'postgres', 'sqlite', 'mongo', 'mysql', 'docker', 'ubuntu', 'linux', 'server',
165
- 'pipeline', 'ci', 'cd', 'github', 'actions', 'oauth', 'auth', 'security',
166
- 'token', 'key', 'credential', 'package', 'dependency', 'library', 'script',
167
- 'test', 'jest', 'vitest', 'eslint', 'prettier', 'tailwind', 'css', 'html',
168
- 'js', 'ts', 'typescript', 'javascript', 'eval', 'function', 'class', 'component',
169
- 'import', 'export', 'require', 'const', 'let', 'var', 'compiler', 'build',
170
- 'cli', 'command', 'terminal', 'mcp', 'server', 'client', 'persyst', 'memory'
171
- ];
172
-
173
- /**
174
- * Filter out conversational filler and keep only valid technical statements/preferences.
175
- * @param {string} content - The extracted fact text
176
- * @returns {boolean} - true if it is a valid, high-value fact
177
- */
178
- function cognitiveNoiseFilter(content) {
179
- const normalized = content.toLowerCase().trim();
180
-
181
- // 1. Filter out interrogatives (questions)
182
- const questionWords = ['how', 'why', 'what', 'where', 'when', 'who', 'can', 'could', 'would', 'is', 'are', 'should'];
183
- if (normalized.endsWith('?')) return false;
184
- for (const q of questionWords) {
185
- if (normalized.startsWith(q + ' ') || normalized.includes(` ${q} `) || normalized.includes(`:${q} `)) {
186
- if (normalized.includes(' ?') || normalized.endsWith('?')) return false;
187
- if (/preference:\s+(?:can|could|would|is|are|how|why|what|where)\s/i.test(content)) return false;
188
- if (/rule:\s+(?:can|could|would|is|are|how|why|what|where)\s/i.test(content)) return false;
189
- if (/decision:\s+(?:can|could|would|is|are|how|why|what|where)\s/i.test(content)) return false;
190
- }
191
- }
192
-
193
- // 2. Filter out transient pronouns/vague statements without enough context
194
- if (/preference:\s+(?:this|that|it|these|those|us|me|them|him|her)\b/i.test(content)) return false;
195
- if (/decision:\s+(?:this|that|it|these|those|us|me|them|him|her)\b/i.test(content)) return false;
196
-
197
- // 3. Filter out transient time references indicating very short-term state
198
- const transientTerms = ['today', 'tomorrow', 'yesterday', 'now', 'just', 'temporary', 'currently', 'for now', 'briefly', 'at the moment'];
199
- for (const term of transientTerms) {
200
- if (normalized.includes(` ${term} `) || normalized.endsWith(` ${term}`)) {
201
- return false;
202
- }
203
- }
204
-
205
- // 4. Filter out trace logs, build outputs, compile errors
206
- if (normalized.includes('at ') && normalized.includes('.js:')) return false;
207
- if (normalized.includes('error:') || normalized.includes('exception:')) return false;
208
- if (normalized.includes('exit code') || normalized.includes('npm error')) return false;
209
-
210
- // 5. Require at least one programming/project-related concept
211
- const words = normalized.split(/[^a-zA-Z0-9\-\.\/]+/);
212
- const hasTechTerm = words.some(w => {
213
- return TECH_CONCEPTS.some(concept => {
214
- if (concept.length <= 2) {
215
- return w === concept;
216
- }
217
- return w.includes(concept);
218
- }) ||
219
- w.endsWith('.js') || w.endsWith('.json') || w.endsWith('.css') || w.endsWith('.md') ||
220
- w.includes('/') || w.includes('\\');
221
- });
222
-
223
- if (!hasTechTerm) {
224
- return false;
225
- }
226
-
227
- return true;
228
- }
229
-
230
- // ============================================================
231
- // MAIN EXTRACTION FUNCTION
232
- // ============================================================
233
-
234
- /**
235
- * Extract facts from raw conversation text using regex heuristics.
236
- *
237
- * @param {string} text - Raw conversation text (user prompt or full turn)
238
- * @param {Object} [options={}]
239
- * @param {number} [options.minConfidence=0.65] - Minimum confidence to include a fact
240
- * @param {number} [options.maxFacts=10] - Maximum facts to extract per call
241
- * @returns {Array<{content: string, category: string, confidence: number}>}
242
- *
243
- * @example
244
- * const facts = extractHeuristic("I prefer Postgres over SQLite for our backend database.");
245
- * // => [{ content: "Preference: Postgres over SQLite", category: "preference", confidence: 0.80 }]
246
- */
247
- export function extractHeuristic(text, options = {}) {
248
- const {
249
- minConfidence = 0.65,
250
- maxFacts = 10
251
- } = options;
252
-
253
- if (!text || typeof text !== 'string' || text.length < 10) {
254
- return [];
255
- }
256
-
257
- const facts = [];
258
- const seen = new Set(); // dedup by normalized content
259
-
260
- // Process line-by-line to filter noise
261
- const lines = text.split('\n');
262
- const cleanLines = lines.filter(line => !isNoiseLine(line));
263
- const cleanText = cleanLines.join('\n');
264
-
265
- for (const pattern of PATTERNS) {
266
- // Reset regex state for global matching
267
- pattern.regex.lastIndex = 0;
268
-
269
- let match;
270
- while ((match = pattern.regex.exec(cleanText)) !== null) {
271
- // Skip matches that are too short to be meaningful
272
- if (match[0].length < 8) continue;
273
-
274
- try {
275
- const content = pattern.template(match);
276
- if (!content || content.length < 5) continue;
277
-
278
- if (!cognitiveNoiseFilter(content)) continue;
279
-
280
- // Normalize for dedup
281
- const key = content.toLowerCase().replace(/\s+/g, ' ').trim();
282
- if (seen.has(key)) continue;
283
- seen.add(key);
284
-
285
- if (pattern.confidence >= minConfidence) {
286
- facts.push({
287
- content,
288
- category: pattern.category,
289
- confidence: pattern.confidence
290
- });
291
- }
292
-
293
- if (facts.length >= maxFacts) break;
294
- } catch (_) {
295
- // Template execution failed — skip this match
296
- continue;
297
- }
298
- }
299
-
300
- if (facts.length >= maxFacts) break;
301
- }
302
-
303
- // Sort by confidence descending
304
- facts.sort((a, b) => b.confidence - a.confidence);
305
-
306
- return facts;
307
- }
308
-
309
- /**
310
- * Quick check: does this text contain any extractable signals?
311
- * Cheaper than running full extraction — use as a gate.
312
- *
313
- * @param {string} text
314
- * @returns {boolean}
315
- */
316
- export function hasExtractableSignals(text) {
317
- if (!text || text.length < 10) return false;
318
-
319
- for (const pattern of PATTERNS) {
320
- pattern.regex.lastIndex = 0;
321
- if (pattern.regex.test(text)) return true;
322
- }
323
- return false;
324
- }
1
+ /**
2
+ * extractor-heuristic.js — Tier 2: Zero-Cost Regex-Based Fact Extractor
3
+ *
4
+ * Scans raw conversation text for extractable knowledge signals.
5
+ *
6
+ * Operates in TWO modes:
7
+ *
8
+ * 1. EXPLICIT SAVE MODE (highest priority, bypasses all filters)
9
+ * Triggered when user says: "remember", "save this", "note:", "important:",
10
+ * "don't forget", "fyi", "keep in mind", "remind me", "make a note"
11
+ * These always get stored — confidence 0.85–0.95 depending on the trigger. No tech filter applied.
12
+ * Examples:
13
+ * "Remember: the staging server is flaky on Mondays"
14
+ * "Note: John handles DB migrations, don't touch those files"
15
+ * "Don't forget the SSL cert expires March 15"
16
+ * "FYI the client doesn't want emojis in any responses"
17
+ *
18
+ * 2. IMPLICIT PATTERN MODE (normal extraction, requires tech context)
19
+ * Regex patterns for common developer signal phrases:
20
+ * "I prefer...", "we decided...", "always use...", "stack includes..."
21
+ * Conservative: high-precision, low-recall
22
+ * Filters non-technical content (noise filter)
23
+ *
24
+ * Design decisions:
25
+ * - Runs synchronously — zero latency overhead on the hot path
26
+ * - Returns structured facts with confidence scores (0.0 - 1.0)
27
+ * - Explicit saves always win — no filter can suppress them
28
+ */
29
+
30
+ // ============================================================
31
+ // EXPLICIT SAVE TRIGGERS
32
+ // These phrases indicate the user intentionally wants something saved.
33
+ // Order matters — more specific patterns come first.
34
+ // ============================================================
35
+
36
+ const EXPLICIT_SAVE_PATTERNS = [
37
+ // "remember: ..." / "remember that ..." / "remember to ..."
38
+ {
39
+ regex: /\bremember(?:\s*[:–—])?\s+(?:that\s+|to\s+)?(.+?)(?:\.|$)/gi,
40
+ category: 'note',
41
+ confidence: 0.95
42
+ },
43
+ // "note: ..." / "note that ..."
44
+ {
45
+ regex: /\bnote(?:\s*[:–—])\s*(.+?)(?:\.|$)/gi,
46
+ category: 'note',
47
+ confidence: 0.95
48
+ },
49
+ {
50
+ regex: /\bnote\s+that\s+(.+?)(?:\.|$)/gi,
51
+ category: 'note',
52
+ confidence: 0.95
53
+ },
54
+ // "important: ..."
55
+ {
56
+ regex: /\bimportant(?:\s*[:–—])\s*(.+?)(?:\.|$)/gi,
57
+ category: 'note',
58
+ confidence: 0.95
59
+ },
60
+ // "fyi: ..." / "fyi, ..."
61
+ {
62
+ regex: /\bfyi(?:\s*[:–—,])?\s*(.+?)(?:\.|$)/gi,
63
+ category: 'note',
64
+ confidence: 0.90
65
+ },
66
+ // "don't forget ..."
67
+ {
68
+ regex: /\bdon['']t\s+forget\s+(?:that\s+|to\s+)?(.+?)(?:\.|$)/gi,
69
+ category: 'reminder',
70
+ confidence: 0.90
71
+ },
72
+ // "keep in mind ..."
73
+ {
74
+ regex: /\bkeep\s+in\s+mind\s+(?:that\s+)?(.+?)(?:\.\s*$|$)/gi,
75
+ category: 'note',
76
+ confidence: 0.90
77
+ },
78
+ // "save this: ..." / "save that ..."
79
+ {
80
+ regex: /\bsave\s+(?:this|that|the following)(?:\s*[:–—])?\s*(.+?)(?:\.|$)/gi,
81
+ category: 'note',
82
+ confidence: 0.95
83
+ },
84
+ // "remind me ..." / "set a reminder ..."
85
+ {
86
+ regex: /\bremind\s+(?:me\s+)?(?:to\s+|that\s+|about\s+)?(.+?)(?:\.|$)/gi,
87
+ category: 'reminder',
88
+ confidence: 0.90
89
+ },
90
+ // "make a note ..." / "take a note ..."
91
+ {
92
+ regex: /\b(?:make|take)\s+a\s+note(?:\s*[:–—]|s?\s+that\s+|s?\s+about\s+|:?\s+)?(.+?)(?:\.|$)/gi,
93
+ category: 'note',
94
+ confidence: 0.90
95
+ },
96
+ // "heads up: ..." / "heads up, ..."
97
+ {
98
+ regex: /\bheads?\s+up(?:\s*[:–—,])?\s*(.+?)(?:\.|$)/gi,
99
+ category: 'note',
100
+ confidence: 0.90
101
+ },
102
+ // "warning: ..." (project context, not log output)
103
+ {
104
+ regex: /\bwarning(?:\s*[:–—])\s*(.+?)(?:\.|$)/gi,
105
+ category: 'note',
106
+ confidence: 0.85
107
+ },
108
+ // "caution: ..."
109
+ {
110
+ regex: /\bcaution(?:\s*[:–—])\s*(.+?)(?:\.|$)/gi,
111
+ category: 'note',
112
+ confidence: 0.85
113
+ },
114
+ // "the rule is ..." / "our rule is ..."
115
+ {
116
+ regex: /\b(?:the|our)\s+rule\s+is\s+(?:that\s+)?(.+?)(?:\.|$)/gi,
117
+ category: 'rule',
118
+ confidence: 0.90
119
+ }
120
+ ];
121
+
122
+ // ============================================================
123
+ // IMPLICIT PATTERN DEFINITIONS
124
+ // Ordered by specificity — most specific patterns first
125
+ // Each pattern: regex, category, confidence, template
126
+ // ============================================================
127
+
128
+ const PATTERNS = [
129
+ // --- Decision patterns (highest confidence) ---
130
+ {
131
+ regex: /(?:we|i|the team)\s+(?:have\s+)?decided\s+(?:to\s+)?(?:use|go\s+with|adopt|switch\s+to|move\s+to)\s+(.+?)(?:\.|$)/gi,
132
+ category: 'decision',
133
+ confidence: 0.85,
134
+ template: (match) => `Decision: ${cleanFact(match[1])}`
135
+ },
136
+ {
137
+ regex: /(?:we(?:'re|\s+are)?\s+)?(?:going|moving)\s+(?:to\s+)?(?:use|adopt|switch\s+to|migrate\s+to)\s+(.+?)(?:\s+(?:for|because|since|as)\b|\.|$)/gi,
138
+ category: 'decision',
139
+ confidence: 0.80,
140
+ template: (match) => `Decision: Moving to ${cleanFact(match[1])}`
141
+ },
142
+
143
+ // --- Explicit preference patterns ---
144
+ {
145
+ regex: /i\s+(?:always\s+)?prefer\s+(.+?)(?:\s+(?:over|instead\s+of|rather\s+than)\s+(.+?))?(?:\.|$)/gi,
146
+ category: 'preference',
147
+ confidence: 0.80,
148
+ template: (match) => {
149
+ const pref = cleanFact(match[1]);
150
+ const alt = match[2] ? ` over ${cleanFact(match[2])}` : '';
151
+ return `Preference: ${pref}${alt}`;
152
+ }
153
+ },
154
+ {
155
+ regex: /(?:we|i)\s+(?:should\s+)?(?:always|never)\s+(?:use|avoid|include|add|write|create)\s+(.+?)(?:\.|$)/gi,
156
+ category: 'preference',
157
+ confidence: 0.75,
158
+ template: (match) => `Rule: ${cleanFact(match[0])}`
159
+ },
160
+
161
+ // --- Stack / technology patterns ---
162
+ {
163
+ regex: /(?:our|the|my)\s+(?:tech\s+)?stack\s+(?:includes?|uses?|is|has)\s+(.+?)(?:\.\s|\.$|$)/gim,
164
+ category: 'stack',
165
+ confidence: 0.85,
166
+ template: (match) => `Stack: ${cleanFact(match[1])}`
167
+ },
168
+ {
169
+ regex: /(?:we(?:'re|\s+are)?\s+)?using\s+(.+?)\s+(?:for|as)\s+(?:our|the)\s+(.+?)(?:\.|$)/gi,
170
+ category: 'stack',
171
+ confidence: 0.80,
172
+ template: (match) => `Stack: Using ${cleanFact(match[1])} for ${cleanFact(match[2])}`
173
+ },
174
+ {
175
+ regex: /(?:our|the)\s+(?:backend|frontend|database|api|server|client|infra(?:structure)?)\s+(?:is|uses?|runs?\s+on)\s+(.+?)(?:\.|$)/gi,
176
+ category: 'stack',
177
+ confidence: 0.80,
178
+ template: (match) => `Stack: ${cleanFact(match[0])}`
179
+ },
180
+
181
+ // --- Naming / convention patterns ---
182
+ {
183
+ regex: /(?:name|call|rename)\s+(?:it|this|the\s+\w+)\s+[\"'`]?(\w[\w\-\.]+)[\"'`]?/gi,
184
+ category: 'naming',
185
+ confidence: 0.70,
186
+ template: (match) => `Naming: ${cleanFact(match[0])}`
187
+ },
188
+
189
+ // --- Architecture patterns ---
190
+ {
191
+ regex: /(?:the\s+)?(?:project|app|application|system|architecture)\s+(?:follows?|uses?|is\s+based\s+on|implements?)\s+(.+?)(?:\s+pattern|\s+architecture)?(?:\.|$)/gi,
192
+ category: 'architecture',
193
+ confidence: 0.80,
194
+ template: (match) => `Architecture: ${cleanFact(match[1])}`
195
+ },
196
+
197
+ // --- Coding rule / style patterns ---
198
+ {
199
+ regex: /(?:always|never|must|should|don't|do\s+not)\s+(?:use|write|create|add|include|put|place|keep)\s+(.+?)(?:\.|$)/gi,
200
+ category: 'rule',
201
+ confidence: 0.70,
202
+ template: (match) => `Rule: ${cleanFact(match[0])}`
203
+ },
204
+
205
+ // --- Config / env patterns ---
206
+ {
207
+ regex: /(?:set|change|update|configure)\s+(?:the\s+)?(?:port|host|env|environment|config|setting)\s+(?:to|=|:)\s*[\"'`]?(.+?)[\"'`]?(?:\.|$)/gi,
208
+ category: 'config',
209
+ confidence: 0.75,
210
+ template: (match) => `Config: ${cleanFact(match[0])}`
211
+ }
212
+ ];
213
+
214
+ // ============================================================
215
+ // NOISE FILTERS
216
+ // Skip lines that look like code, errors, or system output
217
+ // ============================================================
218
+
219
+ const NOISE_PATTERNS = [
220
+ /^[\s]*(?:import|export|const|let|var|function|class|if|else|for|while|return|throw|try|catch)\s/, // line opens with a lowercase JS keyword followed by whitespace (case-sensitive)
221
+ /^[\s]*[{}\[\]();]/, // line opens with a brace, bracket, parenthesis or semicolon
222
+ /^[\s]*\/\//, // line comments
223
+ /^[\s]*\*/, // line opens with "*" (block-comment continuation lines; also matches "*" bullets)
224
+ /^[\s]*```/, // markdown code-fence markers
225
+ /^\s*$/, // empty or whitespace-only lines
226
+ /^(?:error|warning|info|debug|trace):/i, // log-level prefixes at column 0, any letter case
227
+ /^\s*at\s+\w+/, // stack trace lines
228
+ /^[A-Z_]{2,}=/, // ENV variable assignments
229
+ /^\d{4}-\d{2}-\d{2}/, // timestamp lines
230
+ ];
231
+
232
+ /**
233
+ * Report whether a line matches at least one entry of NOISE_PATTERNS (code, logs, blank lines, etc.)
234
+ * @param {string} line - The text to test against every noise pattern
235
+ * @returns {boolean} true as soon as any pattern matches, false when none do
236
+ */
237
+ function isNoiseLine(line) {
238
+ return NOISE_PATTERNS.some(p => p.test(line));
239
+ }
240
+
241
+ // ============================================================
242
+ // FACT NORMALIZATION & COGNITIVE FILTER
243
+ // ============================================================
244
+
245
+ /**
246
+ * Clean and normalize an extracted fact string.
247
+ * Steps run in this order: trim, collapse whitespace runs, strip trailing , ; : characters, strip leading/trailing quote characters, cap at 200 characters.
248
+ * @param {string} raw - Captured text; any falsy value (undefined, null, '') yields ''
249
+ * @returns {string} The normalized text, at most 200 characters long
250
+ */
251
+ function cleanFact(raw) {
252
+ if (!raw) return '';
253
+ return raw
254
+ .trim()
255
+ .replace(/[\s]+/g, ' ') // collapse every whitespace run (newlines included) into one space
256
+ .replace(/[,;:]+$/, '') // strip trailing commas, semicolons and colons (periods are left alone)
257
+ .replace(/^["'`]+|["'`]+$/g, '') // strip quote characters from both ends
258
+ .slice(0, 200); // hard max fact length
259
+ }
260
+
261
+ // List of programming/tech concepts to distinguish tech context from conversational filler
262
+ const TECH_CONCEPTS = [
263
+ 'mode', 'theme', 'config', 'stack', 'style', 'code', 'file', 'folder', 'path',
264
+ 'api', 'endpoint', 'json', 'data', 'db', 'database', 'table', 'migration',
265
+ 'schema', 'sql', 'query', 'url', 'port', 'host', 'env', 'environment',
266
+ 'node', 'npm', 'git', 'react', 'vue', 'angular', 'svelte', 'next', 'express',
267
+ 'postgres', 'sqlite', 'mongo', 'mysql', 'docker', 'ubuntu', 'linux', 'server',
268
+ 'pipeline', 'ci', 'cd', 'github', 'actions', 'oauth', 'auth', 'security',
269
+ 'token', 'key', 'credential', 'package', 'dependency', 'library', 'script',
270
+ 'test', 'jest', 'vitest', 'eslint', 'prettier', 'tailwind', 'css', 'html',
271
+ 'js', 'ts', 'typescript', 'javascript', 'eval', 'function', 'class', 'component',
272
+ 'import', 'export', 'require', 'const', 'let', 'var', 'compiler', 'build',
273
+ 'cli', 'command', 'terminal', 'mcp', 'server', 'client', 'persyst', 'memory'
274
+ ];
275
+
276
+ /**
277
+ * Filter out conversational filler and keep only valid technical statements/preferences.
278
+ * NOTE: This filter is ONLY applied to implicit pattern matches, NOT to explicit saves.
279
+ * @param {string} content - The extracted fact text
280
+ * @returns {boolean} - true if it is a valid, high-value fact
281
+ */
282
+ function cognitiveNoiseFilter(content) {
283
+ const normalized = content.toLowerCase().trim();
284
+
285
+ // 1. Filter out interrogatives (questions)
286
+ const questionWords = ['how', 'why', 'what', 'where', 'when', 'who', 'can', 'could', 'would', 'is', 'are', 'should'];
287
+ if (normalized.endsWith('?')) return false;
288
+ for (const q of questionWords) {
289
+ if (normalized.startsWith(q + ' ') || normalized.includes(` ${q} `) || normalized.includes(`:${q} `)) {
290
+ if (normalized.includes(' ?') || normalized.endsWith('?')) return false;
291
+ if (/preference:\s+(?:can|could|would|is|are|how|why|what|where)\s/i.test(content)) return false;
292
+ if (/rule:\s+(?:can|could|would|is|are|how|why|what|where)\s/i.test(content)) return false;
293
+ if (/decision:\s+(?:can|could|would|is|are|how|why|what|where)\s/i.test(content)) return false;
294
+ }
295
+ }
296
+
297
+ // 2. Filter out transient pronouns/vague statements without enough context
298
+ if (/preference:\s+(?:this|that|it|these|those|us|me|them|him|her)\b/i.test(content)) return false;
299
+ if (/decision:\s+(?:this|that|it|these|those|us|me|them|him|her)\b/i.test(content)) return false;
300
+
301
+ // 3. Filter out transient time references indicating very short-term state
302
+ const transientTerms = ['today', 'tomorrow', 'yesterday', 'now', 'just', 'temporary', 'currently', 'for now', 'briefly', 'at the moment'];
303
+ for (const term of transientTerms) {
304
+ if (normalized.includes(` ${term} `) || normalized.endsWith(` ${term}`)) {
305
+ return false;
306
+ }
307
+ }
308
+
309
+ // 4. Filter out trace logs, build outputs, compile errors
310
+ if (normalized.includes('at ') && normalized.includes('.js:')) return false;
311
+ if (normalized.includes('error:') || normalized.includes('exception:')) return false;
312
+ if (normalized.includes('exit code') || normalized.includes('npm error')) return false;
313
+
314
+ // 5. Require at least one programming/project-related concept
315
+ const words = normalized.split(/[^a-zA-Z0-9\-\.\/]+/);
316
+ const hasTechTerm = words.some(w => {
317
+ return TECH_CONCEPTS.some(concept => {
318
+ if (concept.length <= 2) {
319
+ return w === concept;
320
+ }
321
+ return w.includes(concept);
322
+ }) ||
323
+ w.endsWith('.js') || w.endsWith('.json') || w.endsWith('.css') || w.endsWith('.md') ||
324
+ w.includes('/') || w.includes('\\');
325
+ });
326
+
327
+ if (!hasTechTerm) {
328
+ return false;
329
+ }
330
+
331
+ return true;
332
+ }
333
+
334
+ // ============================================================
335
+ // EXPLICIT SAVE EXTRACTION
336
+ // Runs first. Bypasses all noise filters.
337
+ // The user said "remember this" — we save it, period.
338
+ // ============================================================
339
+
340
+ /**
341
+ * Extract explicitly-commanded saves from text.
342
+ * User phrases like "remember:", "note:", "don't forget" always get stored.
343
+ * No tech concept filter. No question filter. Confidence: 0.90–0.95.
344
+ *
345
+ * @param {string} text
346
+ * @returns {Array<{content: string, category: string, confidence: number, explicit: true}>}
347
+ */
348
+ function extractExplicitSaves(text) {
349
+ const results = [];
350
+ const seen = new Set();
351
+
352
+ for (const pattern of EXPLICIT_SAVE_PATTERNS) {
353
+ pattern.regex.lastIndex = 0;
354
+ let match;
355
+ while ((match = pattern.regex.exec(text)) !== null) {
356
+ const raw = match[1] || match[0];
357
+ const cleaned = cleanFact(raw);
358
+
359
+ // Minimum useful length
360
+ if (!cleaned || cleaned.length < 8) continue;
361
+
362
+ // Skip pure questions
363
+ if (cleaned.endsWith('?')) continue;
364
+
365
+ // Skip if this is just a meta-instruction to the system itself ("remember to search memories")
366
+ const metaWords = ['search_memories', 'add_memory', 'get_optimized_context', 'persyst tool'];
367
+ if (metaWords.some(w => cleaned.toLowerCase().includes(w))) continue;
368
+
369
+ const key = cleaned.toLowerCase().replace(/\s+/g, ' ').trim();
370
+ if (seen.has(key)) continue;
371
+ seen.add(key);
372
+
373
+ // Format the content with a Note:/Reminder: prefix if not already prefixed
374
+ let content = cleaned;
375
+ if (!/^(?:Note|Reminder|Rule|Important|Warning|Caution|FYI):/i.test(cleaned)) {
376
+ const prefix = pattern.category === 'reminder' ? 'Reminder' : 'Note';
377
+ content = `${prefix}: ${cleaned}`;
378
+ }
379
+
380
+ results.push({
381
+ content,
382
+ category: pattern.category,
383
+ confidence: pattern.confidence,
384
+ explicit: true // Mark as user-commanded — bypasses any downstream filters
385
+ });
386
+ }
387
+ }
388
+
389
+ return results;
390
+ }
391
+
392
+ // ============================================================
393
+ // MAIN EXTRACTION FUNCTION
394
+ // ============================================================
395
+
396
+ /**
397
+ * Extract facts from raw conversation text using regex heuristics.
398
+ *
399
+ * Runs in priority order:
400
+ * 1. Explicit saves ("remember:", "note:", "don't forget") — always stored
401
+ * 2. Implicit patterns (tech decisions, preferences, rules) — filtered
402
+ *
403
+ * @param {string} text - Raw conversation text (user prompt or full turn)
404
+ * @param {Object} [options={}]
405
+ * @param {number} [options.minConfidence=0.65] - Minimum confidence to include a fact
406
+ * @param {number} [options.maxFacts=15] - Maximum facts to extract per call
407
+ * @returns {Array<{content: string, category: string, confidence: number, explicit?: boolean}>}
408
+ *
409
+ * @example
410
+ * // Explicit save — bypasses all filters
411
+ * extractHeuristic("Remember: the staging server is flaky on Mondays")
412
+ * // => [{ content: "Note: the staging server is flaky on Mondays", category: "note", confidence: 0.95, explicit: true }]
413
+ *
414
+ * // Implicit pattern — goes through noise filter
415
+ * extractHeuristic("I prefer Postgres over SQLite for our backend database.")
416
+ * // => [{ content: "Preference: Postgres over SQLite", category: "preference", confidence: 0.80 }]
417
+ */
418
+ export function extractHeuristic(text, options = {}) {
419
+ const {
420
+ minConfidence = 0.65,
421
+ maxFacts = 15
422
+ } = options;
423
+
424
+ if (!text || typeof text !== 'string' || text.length < 10) {
425
+ return [];
426
+ }
427
+
428
+ // Strip all markdown fenced code blocks to prevent extracting facts from example code/logs
429
+ const cleanSourceText = text.replace(/```[\s\S]*?```/g, '');
430
+
431
+ // --- Step 1: Explicit saves (highest priority, no filter) ---
432
+ const explicitFacts = extractExplicitSaves(cleanSourceText);
433
+
434
+ // --- Step 2: Implicit pattern matching (filtered, tech-required) ---
435
+ const implicitFacts = [];
436
+ const seen = new Set(explicitFacts.map(f => f.content.toLowerCase().replace(/\s+/g, ' ').trim()));
437
+
438
+ // Process line-by-line to filter code/noise
439
+ const lines = cleanSourceText.split('\n');
440
+ const cleanLines = lines.filter(line => !isNoiseLine(line));
441
+ const cleanText = cleanLines.join('\n');
442
+
443
+ for (const pattern of PATTERNS) {
444
+ pattern.regex.lastIndex = 0;
445
+
446
+ let match;
447
+ while ((match = pattern.regex.exec(cleanText)) !== null) {
448
+ if (match[0].length < 8) continue;
449
+
450
+ try {
451
+ const content = pattern.template(match);
452
+ if (!content || content.length < 5) continue;
453
+
454
+ if (!cognitiveNoiseFilter(content)) continue;
455
+
456
+ const key = content.toLowerCase().replace(/\s+/g, ' ').trim();
457
+ if (seen.has(key)) continue;
458
+ seen.add(key);
459
+
460
+ if (pattern.confidence >= minConfidence) {
461
+ implicitFacts.push({
462
+ content,
463
+ category: pattern.category,
464
+ confidence: pattern.confidence
465
+ });
466
+ }
467
+
468
+ if (explicitFacts.length + implicitFacts.length >= maxFacts) break;
469
+ } catch (_) {
470
+ continue;
471
+ }
472
+ }
473
+
474
+ if (explicitFacts.length + implicitFacts.length >= maxFacts) break;
475
+ }
476
+
477
+ // Explicit facts first (user-commanded), then implicit sorted by confidence
478
+ implicitFacts.sort((a, b) => b.confidence - a.confidence);
479
+ return [...explicitFacts, ...implicitFacts];
480
+ }
481
+
482
+ /**
483
+ * Quick check: does this text contain any extractable signals?
484
+ * Cheaper than running full extraction — use as a gate.
485
+ *
486
+ * @param {string} text
487
+ * @returns {boolean}
488
+ */
489
+ export function hasExtractableSignals(text) {
490
+ if (!text || text.length < 10) return false;
491
+
492
+ // Check explicit save triggers first (very cheap)
493
+ for (const pattern of EXPLICIT_SAVE_PATTERNS) {
494
+ pattern.regex.lastIndex = 0;
495
+ if (pattern.regex.test(text)) return true;
496
+ }
497
+
498
+ // Then implicit patterns
499
+ for (const pattern of PATTERNS) {
500
+ pattern.regex.lastIndex = 0;
501
+ if (pattern.regex.test(text)) return true;
502
+ }
503
+
504
+ return false;
505
+ }