persyst-mcp 2.1.3 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,250 +1,324 @@
1
- /**
2
- * extractor-heuristic.js — Tier 2: Zero-Cost Regex-Based Fact Extractor
3
- *
4
- * Scans raw conversation text for explicit developer preference signals:
5
- * "I prefer...", "we decided...", "always use...", "stack includes..."
6
- *
7
- * Design decisions:
8
- * - Runs synchronously — zero latency overhead on the hot path
9
- * - Conservative extraction: high-precision, low-recall
10
- * - Returns structured facts with confidence scores (0.0 - 1.0)
11
- * - Deduplication-ready: facts are normalized before output
12
- *
13
- * This is NOT the primary extraction tier. It's a lightweight safety net
14
- * that catches the most obvious signals when Tier 3 (LLM) is unavailable
15
- * or still processing asynchronously.
16
- */
17
-
18
- // ============================================================
19
- // PATTERN DEFINITIONS
20
- // Ordered by specificity — most specific patterns first
21
- // Each pattern has: regex, category, confidence, and a template
22
- // to normalize the matched text into a clean fact statement.
23
- // ============================================================
24
-
25
- const PATTERNS = [
26
- // --- Decision patterns (highest confidence) ---
27
- {
28
- regex: /(?:we|i|the team)\s+(?:have\s+)?decided\s+(?:to\s+)?(?:use|go\s+with|adopt|switch\s+to|move\s+to)\s+(.+?)(?:\.|$)/gi,
29
- category: 'decision',
30
- confidence: 0.85,
31
- template: (match) => `Decision: ${cleanFact(match[1])}`
32
- },
33
- {
34
- regex: /(?:we(?:'re|\s+are)?\s+)?(?:going|moving)\s+(?:to\s+)?(?:use|adopt|switch\s+to|migrate\s+to)\s+(.+?)(?:\s+(?:for|because|since|as)\b|\.|$)/gi,
35
- category: 'decision',
36
- confidence: 0.80,
37
- template: (match) => `Decision: Moving to ${cleanFact(match[1])}`
38
- },
39
-
40
- // --- Explicit preference patterns ---
41
- {
42
- regex: /i\s+(?:always\s+)?prefer\s+(.+?)(?:\s+(?:over|instead\s+of|rather\s+than)\s+(.+?))?(?:\.|$)/gi,
43
- category: 'preference',
44
- confidence: 0.80,
45
- template: (match) => {
46
- const pref = cleanFact(match[1]);
47
- const alt = match[2] ? ` over ${cleanFact(match[2])}` : '';
48
- return `Preference: ${pref}${alt}`;
49
- }
50
- },
51
- {
52
- regex: /(?:we|i)\s+(?:should\s+)?(?:always|never)\s+(?:use|avoid|include|add|write|create)\s+(.+?)(?:\.|$)/gi,
53
- category: 'preference',
54
- confidence: 0.75,
55
- template: (match) => `Rule: ${cleanFact(match[0])}`
56
- },
57
-
58
- // --- Stack / technology patterns ---
59
- {
60
- regex: /(?:our|the|my)\s+(?:tech\s+)?stack\s+(?:includes?|uses?|is|has)\s+(.+?)(?:\.\s|\.$|$)/gim,
61
- category: 'stack',
62
- confidence: 0.85,
63
- template: (match) => `Stack: ${cleanFact(match[1])}`
64
- },
65
- {
66
- regex: /(?:we(?:'re|\s+are)?\s+)?using\s+(.+?)\s+(?:for|as)\s+(?:our|the)\s+(.+?)(?:\.|$)/gi,
67
- category: 'stack',
68
- confidence: 0.80,
69
- template: (match) => `Stack: Using ${cleanFact(match[1])} for ${cleanFact(match[2])}`
70
- },
71
- {
72
- regex: /(?:our|the)\s+(?:backend|frontend|database|api|server|client|infra(?:structure)?)\s+(?:is|uses?|runs?\s+on)\s+(.+?)(?:\.|$)/gi,
73
- category: 'stack',
74
- confidence: 0.80,
75
- template: (match) => `Stack: ${cleanFact(match[0])}`
76
- },
77
-
78
- // --- Naming / convention patterns ---
79
- {
80
- regex: /(?:name|call|rename)\s+(?:it|this|the\s+\w+)\s+["'`]?(\w[\w\-\.]+)["'`]?/gi,
81
- category: 'naming',
82
- confidence: 0.70,
83
- template: (match) => `Naming: ${cleanFact(match[0])}`
84
- },
85
-
86
- // --- Architecture patterns ---
87
- {
88
- regex: /(?:the\s+)?(?:project|app|application|system|architecture)\s+(?:follows?|uses?|is\s+based\s+on|implements?)\s+(.+?)(?:\s+pattern|\s+architecture)?(?:\.|$)/gi,
89
- category: 'architecture',
90
- confidence: 0.80,
91
- template: (match) => `Architecture: ${cleanFact(match[1])}`
92
- },
93
-
94
- // --- Coding rule / style patterns ---
95
- {
96
- regex: /(?:always|never|must|should|don't|do\s+not)\s+(?:use|write|create|add|include|put|place|keep)\s+(.+?)(?:\.|$)/gi,
97
- category: 'rule',
98
- confidence: 0.70,
99
- template: (match) => `Rule: ${cleanFact(match[0])}`
100
- },
101
-
102
- // --- Config / env patterns ---
103
- {
104
- regex: /(?:set|change|update|configure)\s+(?:the\s+)?(?:port|host|env|environment|config|setting)\s+(?:to|=|:)\s*["'`]?(.+?)["'`]?(?:\.|$)/gi,
105
- category: 'config',
106
- confidence: 0.75,
107
- template: (match) => `Config: ${cleanFact(match[0])}`
108
- }
109
- ];
110
-
111
- // ============================================================
112
- // NOISE FILTERS
113
- // Skip lines that look like code, errors, or system output
114
- // ============================================================
115
-
116
- const NOISE_PATTERNS = [
117
- /^[\s]*(?:import|export|const|let|var|function|class|if|else|for|while|return|throw|try|catch)\s/,
118
- /^[\s]*[{}\[\]();]/,
119
- /^[\s]*\/\//,
120
- /^[\s]*\*/,
121
- /^[\s]*```/,
122
- /^\s*$/,
123
- /^(?:error|warning|info|debug|trace):/i,
124
- /^\s*at\s+\w+/, // stack trace lines
125
- /^[A-Z_]{2,}=/, // ENV variable assignments
126
- /^\d{4}-\d{2}-\d{2}/, // timestamp lines
127
- ];
128
-
129
- /**
130
- * Check if a line looks like noise (code, logs, etc.)
131
- * @param {string} line
132
- * @returns {boolean}
133
- */
134
- function isNoiseLine(line) {
135
- return NOISE_PATTERNS.some(p => p.test(line));
136
- }
137
-
138
- // ============================================================
139
- // FACT NORMALIZATION
140
- // ============================================================
141
-
142
- /**
143
- * Clean and normalize an extracted fact string.
144
- * Removes trailing punctuation, excess whitespace, and truncates.
145
- * @param {string} raw
146
- * @returns {string}
147
- */
148
- function cleanFact(raw) {
149
- if (!raw) return '';
150
- return raw
151
- .trim()
152
- .replace(/[\s]+/g, ' ') // collapse whitespace
153
- .replace(/[,;:]+$/, '') // strip trailing punctuation
154
- .replace(/^["'`]+|["'`]+$/g, '') // strip quotes
155
- .slice(0, 200); // hard max fact length
156
- }
157
-
158
- // ============================================================
159
- // MAIN EXTRACTION FUNCTION
160
- // ============================================================
161
-
162
- /**
163
- * Extract facts from raw conversation text using regex heuristics.
164
- *
165
- * @param {string} text - Raw conversation text (user prompt or full turn)
166
- * @param {Object} [options={}]
167
- * @param {number} [options.minConfidence=0.65] - Minimum confidence to include a fact
168
- * @param {number} [options.maxFacts=10] - Maximum facts to extract per call
169
- * @returns {Array<{content: string, category: string, confidence: number}>}
170
- *
171
- * @example
172
- * const facts = extractHeuristic("I prefer Postgres over SQLite for our backend database.");
173
- * // => [{ content: "Preference: Postgres over SQLite", category: "preference", confidence: 0.80 }]
174
- */
175
- export function extractHeuristic(text, options = {}) {
176
- const {
177
- minConfidence = 0.65,
178
- maxFacts = 10
179
- } = options;
180
-
181
- if (!text || typeof text !== 'string' || text.length < 10) {
182
- return [];
183
- }
184
-
185
- const facts = [];
186
- const seen = new Set(); // dedup by normalized content
187
-
188
- // Process line-by-line to filter noise
189
- const lines = text.split('\n');
190
- const cleanLines = lines.filter(line => !isNoiseLine(line));
191
- const cleanText = cleanLines.join('\n');
192
-
193
- for (const pattern of PATTERNS) {
194
- // Reset regex state for global matching
195
- pattern.regex.lastIndex = 0;
196
-
197
- let match;
198
- while ((match = pattern.regex.exec(cleanText)) !== null) {
199
- // Skip matches that are too short to be meaningful
200
- if (match[0].length < 8) continue;
201
-
202
- try {
203
- const content = pattern.template(match);
204
- if (!content || content.length < 5) continue;
205
-
206
- // Normalize for dedup
207
- const key = content.toLowerCase().replace(/\s+/g, ' ').trim();
208
- if (seen.has(key)) continue;
209
- seen.add(key);
210
-
211
- if (pattern.confidence >= minConfidence) {
212
- facts.push({
213
- content,
214
- category: pattern.category,
215
- confidence: pattern.confidence
216
- });
217
- }
218
-
219
- if (facts.length >= maxFacts) break;
220
- } catch (_) {
221
- // Template execution failed — skip this match
222
- continue;
223
- }
224
- }
225
-
226
- if (facts.length >= maxFacts) break;
227
- }
228
-
229
- // Sort by confidence descending
230
- facts.sort((a, b) => b.confidence - a.confidence);
231
-
232
- return facts;
233
- }
234
-
235
- /**
236
- * Quick check: does this text contain any extractable signals?
237
- * Cheaper than running full extraction use as a gate.
238
- *
239
- * @param {string} text
240
- * @returns {boolean}
241
- */
242
- export function hasExtractableSignals(text) {
243
- if (!text || text.length < 10) return false;
244
-
245
- for (const pattern of PATTERNS) {
246
- pattern.regex.lastIndex = 0;
247
- if (pattern.regex.test(text)) return true;
248
- }
249
- return false;
250
- }
1
+ /**
2
+ * extractor-heuristic.js — Tier 2: Zero-Cost Regex-Based Fact Extractor
3
+ *
4
+ * Scans raw conversation text for explicit developer preference signals:
5
+ * "I prefer...", "we decided...", "always use...", "stack includes..."
6
+ *
7
+ * Design decisions:
8
+ * - Runs synchronously — zero latency overhead on the hot path
9
+ * - Conservative extraction: high-precision, low-recall
10
+ * - Returns structured facts with confidence scores (0.0 - 1.0)
11
+ * - Deduplication-ready: facts are normalized before output
12
+ *
13
+ * This is NOT the primary extraction tier. It's a lightweight safety net
14
+ * that catches the most obvious signals when Tier 3 (LLM) is unavailable
15
+ * or still processing asynchronously.
16
+ */
17
+
18
+ // ============================================================
19
+ // PATTERN DEFINITIONS
20
+ // Ordered by specificity — most specific patterns first
21
+ // Each pattern has: regex, category, confidence, and a template
22
+ // to normalize the matched text into a clean fact statement.
23
+ // ============================================================
24
+
25
/**
 * Extraction patterns, ordered most-specific first.
 *
 * Each entry has:
 *   - regex:      global, case-insensitive matcher run over the noise-filtered text
 *   - category:   label attached to every fact the pattern produces
 *   - confidence: fixed score (0.0 - 1.0) attached to every fact the pattern produces
 *   - template:   turns a RegExp match array into the normalized fact string
 *
 * Every pattern that terminates on `$` carries the `m` flag so that `$` matches
 * at the end of each line. `.` never crosses a newline, so without `m` a signal
 * on a non-final line that lacks a trailing period could never terminate and
 * was silently dropped.
 *
 * The regexes are stateful (`g` flag): callers must reset `lastIndex` before use.
 */
const PATTERNS = [
  // --- Decision patterns (highest confidence) ---
  {
    regex: /(?:we|i|the team)\s+(?:have\s+)?decided\s+(?:to\s+)?(?:use|go\s+with|adopt|switch\s+to|move\s+to)\s+(.+?)(?:\.|$)/gim,
    category: 'decision',
    confidence: 0.85,
    template: (match) => `Decision: ${cleanFact(match[1])}`,
  },
  {
    regex: /(?:we(?:'re|\s+are)?\s+)?(?:going|moving)\s+(?:to\s+)?(?:use|adopt|switch\s+to|migrate\s+to)\s+(.+?)(?:\s+(?:for|because|since|as)\b|\.|$)/gim,
    category: 'decision',
    confidence: 0.80,
    template: (match) => `Decision: Moving to ${cleanFact(match[1])}`,
  },

  // --- Explicit preference patterns ---
  {
    regex: /i\s+(?:always\s+)?prefer\s+(.+?)(?:\s+(?:over|instead\s+of|rather\s+than)\s+(.+?))?(?:\.|$)/gim,
    category: 'preference',
    confidence: 0.80,
    template: (match) => {
      const pref = cleanFact(match[1]);
      // match[2] is only present when an "over / instead of / rather than" clause matched.
      const alt = match[2] ? ` over ${cleanFact(match[2])}` : '';
      return `Preference: ${pref}${alt}`;
    },
  },
  {
    regex: /(?:we|i)\s+(?:should\s+)?(?:always|never)\s+(?:use|avoid|include|add|write|create)\s+(.+?)(?:\.|$)/gim,
    category: 'preference',
    confidence: 0.75,
    // Uses the whole match so the always/never polarity stays in the fact.
    template: (match) => `Rule: ${cleanFact(match[0])}`,
  },

  // --- Stack / technology patterns ---
  {
    regex: /(?:our|the|my)\s+(?:tech\s+)?stack\s+(?:includes?|uses?|is|has)\s+(.+?)(?:\.\s|\.$|$)/gim,
    category: 'stack',
    confidence: 0.85,
    template: (match) => `Stack: ${cleanFact(match[1])}`,
  },
  {
    regex: /(?:we(?:'re|\s+are)?\s+)?using\s+(.+?)\s+(?:for|as)\s+(?:our|the)\s+(.+?)(?:\.|$)/gim,
    category: 'stack',
    confidence: 0.80,
    template: (match) => `Stack: Using ${cleanFact(match[1])} for ${cleanFact(match[2])}`,
  },
  {
    regex: /(?:our|the)\s+(?:backend|frontend|database|api|server|client|infra(?:structure)?)\s+(?:is|uses?|runs?\s+on)\s+(.+?)(?:\.|$)/gim,
    category: 'stack',
    confidence: 0.80,
    // Uses the whole match so the layer name (backend, database, ...) stays in the fact.
    template: (match) => `Stack: ${cleanFact(match[0])}`,
  },

  // --- Naming / convention patterns ---
  {
    // No `$` terminator here, so the multiline flag is not needed.
    regex: /(?:name|call|rename)\s+(?:it|this|the\s+\w+)\s+["'`]?(\w[\w\-\.]+)["'`]?/gi,
    category: 'naming',
    confidence: 0.70,
    template: (match) => `Naming: ${cleanFact(match[0])}`,
  },

  // --- Architecture patterns ---
  {
    regex: /(?:the\s+)?(?:project|app|application|system|architecture)\s+(?:follows?|uses?|is\s+based\s+on|implements?)\s+(.+?)(?:\s+pattern|\s+architecture)?(?:\.|$)/gim,
    category: 'architecture',
    confidence: 0.80,
    template: (match) => `Architecture: ${cleanFact(match[1])}`,
  },

  // --- Coding rule / style patterns ---
  {
    regex: /(?:always|never|must|should|don't|do\s+not)\s+(?:use|write|create|add|include|put|place|keep)\s+(.+?)(?:\.|$)/gim,
    category: 'rule',
    confidence: 0.70,
    template: (match) => `Rule: ${cleanFact(match[0])}`,
  },

  // --- Config / env patterns ---
  {
    regex: /(?:set|change|update|configure)\s+(?:the\s+)?(?:port|host|env|environment|config|setting)\s+(?:to|=|:)\s*["'`]?(.+?)["'`]?(?:\.|$)/gim,
    category: 'config',
    confidence: 0.75,
    template: (match) => `Config: ${cleanFact(match[0])}`,
  },
];
110
+
111
+ // ============================================================
112
+ // NOISE FILTERS
113
+ // Skip lines that look like code, errors, or system output
114
+ // ============================================================
115
+
116
/**
 * Line-level noise signatures. A line matching any of these is dropped before
 * fact extraction because it looks like code, logs, or other system output.
 * None of these regexes is global, so `.test()` on them is stateless.
 */
const NOISE_PATTERNS = [
  /^[\s]*(?:import|export|const|let|var|function|class|if|else|for|while|return|throw|try|catch)\s/, // code keywords
  /^[\s]*[{}\[\]();]/,                 // lines opening with brackets / punctuation
  /^[\s]*\/\//,                        // line comments
  /^[\s]*\*/,                          // block-comment continuation lines
  /^[\s]*```/,                         // markdown code fences
  /^\s*$/,                             // blank lines
  /^(?:error|warning|info|debug|trace):/i, // log-level prefixes
  /^\s*at\s+\w+/,                      // stack trace lines
  /^[A-Z_][A-Z0-9_]+=/,                // ENV variable assignments (digits allowed after the first char, e.g. S3_BUCKET=)
  /^\d{4}-\d{2}-\d{2}/,                // timestamp lines
];

/**
 * Check if a line looks like noise (code, logs, etc.).
 * @param {string} line - A single line of the input text
 * @returns {boolean} true when the line matches any entry in NOISE_PATTERNS
 */
function isNoiseLine(line) {
  return NOISE_PATTERNS.some((pattern) => pattern.test(line));
}
137
+
138
+ // ============================================================
139
+ // FACT NORMALIZATION & COGNITIVE FILTER
140
+ // ============================================================
141
+
142
/**
 * Clean and normalize an extracted fact string.
 *
 * Collapses whitespace, strips leading quotes, strips any trailing mix of
 * quotes / `,` `;` `:` / whitespace, and truncates to 200 characters.
 *
 * Trailing quotes and punctuation are removed in one pass. Stripping them in
 * two separate steps left residue whenever they were interleaved
 * (`"tabs,"` -> `tabs,`, `use tabs :` -> `use tabs `), which produced distinct
 * dedup keys for what is the same fact.
 *
 * @param {string} raw - Captured text from a pattern match
 * @returns {string} Normalized fact text, or '' for empty/missing input
 */
function cleanFact(raw) {
  if (!raw) return '';
  return raw
    .replace(/\s+/g, ' ')            // collapse whitespace
    .trim()
    .replace(/^["'`]+/, '')          // strip leading quotes
    .replace(/["'`,;:\s]+$/, '')     // strip trailing quotes, punctuation and spaces together
    .slice(0, 200)                   // hard max fact length
    .trimEnd();                      // truncation may have cut right after a space
}
157
+
158
// Vocabulary of programming/project concepts used to tell technical statements
// apart from conversational filler. Entries of length <= 2 must match a token
// exactly; longer entries match as a substring of a token.
const TECH_CONCEPTS = [
  'mode', 'theme', 'config', 'stack', 'style', 'code', 'file', 'folder', 'path',
  'api', 'endpoint', 'json', 'data', 'db', 'database', 'table', 'migration',
  'schema', 'sql', 'query', 'url', 'port', 'host', 'env', 'environment',
  'node', 'npm', 'git', 'react', 'vue', 'angular', 'svelte', 'next', 'express',
  'postgres', 'sqlite', 'mongo', 'mysql', 'docker', 'ubuntu', 'linux', 'server',
  'pipeline', 'ci', 'cd', 'github', 'actions', 'oauth', 'auth', 'security',
  'token', 'key', 'credential', 'package', 'dependency', 'library', 'script',
  'test', 'jest', 'vitest', 'eslint', 'prettier', 'tailwind', 'css', 'html',
  'js', 'ts', 'typescript', 'javascript', 'eval', 'function', 'class', 'component',
  'import', 'export', 'require', 'const', 'let', 'var', 'compiler', 'build',
  'cli', 'command', 'terminal', 'mcp', 'client', 'persyst', 'memory',
];

// Words whose presence as a standalone token marks a possible question.
const QUESTION_WORDS = ['how', 'why', 'what', 'where', 'when', 'who', 'can', 'could', 'would', 'is', 'are', 'should'];

// A fact whose body opens with an interrogative ("Preference: can you ...").
const QUESTION_LEAD_RE = /(?:preference|rule|decision):\s+(?:can|could|would|is|are|how|why|what|where)\s/i;

// A preference/decision whose subject is a bare pronoun ("Decision: it").
const VAGUE_SUBJECT_RE = /(?:preference|decision):\s+(?:this|that|it|these|those|us|me|them|him|her)\b/i;

// Time references that indicate a short-lived state rather than a durable fact.
const TRANSIENT_TERMS = ['today', 'tomorrow', 'yesterday', 'now', 'just', 'temporary', 'currently', 'for now', 'briefly', 'at the moment'];

// Token separator. Backslash is kept inside tokens (alongside `-`, `.`, `/`) so
// that Windows-style paths survive tokenization and can be recognised as paths.
const TOKEN_SPLIT_RE = /[^a-z0-9\-./\\]+/;

/**
 * Decide whether a single lower-cased token looks technical: it matches a
 * TECH_CONCEPTS entry, ends in a known file extension, or contains a path separator.
 * @param {string} token
 * @returns {boolean}
 */
function isTechToken(token) {
  const matchesConcept = TECH_CONCEPTS.some((concept) =>
    (concept.length <= 2 ? token === concept : token.includes(concept))
  );
  return (
    matchesConcept ||
    token.endsWith('.js') || token.endsWith('.json') || token.endsWith('.css') || token.endsWith('.md') ||
    token.includes('/') || token.includes('\\')
  );
}

/**
 * Filter out conversational filler and keep only valid technical statements/preferences.
 *
 * A fact is rejected when it:
 *   1. looks like a question,
 *   2. is a preference/decision about a bare pronoun,
 *   3. mentions a transient time reference,
 *   4. looks like a stack trace, error, or build output, or
 *   5. contains no token that looks technical.
 *
 * @param {string} content - The extracted fact text (including its "Label:" prefix)
 * @returns {boolean} true if it is a valid, high-value fact
 */
function cognitiveNoiseFilter(content) {
  const normalized = content.toLowerCase().trim();

  // 1. Interrogatives.
  if (normalized.endsWith('?')) return false;
  const mentionsQuestionWord = QUESTION_WORDS.some((q) =>
    normalized.startsWith(`${q} `) || normalized.includes(` ${q} `) || normalized.includes(`:${q} `)
  );
  if (mentionsQuestionWord && (normalized.includes(' ?') || QUESTION_LEAD_RE.test(content))) {
    return false;
  }

  // 2. Vague pronoun subjects without enough context.
  if (VAGUE_SUBJECT_RE.test(content)) return false;

  // 3. Transient time references (as a standalone word, mid-sentence or at the end).
  const isTransient = TRANSIENT_TERMS.some((term) =>
    normalized.includes(` ${term} `) || normalized.endsWith(` ${term}`)
  );
  if (isTransient) return false;

  // 4. Trace logs, build outputs, compile errors.
  if (normalized.includes('at ') && normalized.includes('.js:')) return false;
  if (normalized.includes('error:') || normalized.includes('exception:')) return false;
  if (normalized.includes('exit code') || normalized.includes('npm error')) return false;

  // 5. Require at least one programming/project-related token.
  return normalized.split(TOKEN_SPLIT_RE).some(isTechToken);
}
229
+
230
+ // ============================================================
231
+ // MAIN EXTRACTION FUNCTION
232
+ // ============================================================
233
+
234
/**
 * Extract facts from raw conversation text using regex heuristics.
 *
 * Steps: drop noise lines, run every pattern whose confidence meets
 * `minConfidence` over the remaining text, build a fact string per match,
 * reject filler via `cognitiveNoiseFilter`, de-duplicate case-insensitively,
 * sort by confidence (descending), then keep the top `maxFacts`.
 *
 * The cap is applied AFTER sorting. PATTERNS is ordered by specificity, not by
 * confidence, so capping while scanning could discard a higher-confidence fact
 * from a later pattern in favour of a lower-confidence one found earlier.
 *
 * @param {string} text - Raw conversation text (user prompt or full turn)
 * @param {Object} [options={}]
 * @param {number} [options.minConfidence=0.65] - Minimum confidence to include a fact
 * @param {number} [options.maxFacts=10] - Maximum facts to return per call
 * @returns {Array<{content: string, category: string, confidence: number}>}
 *   Facts sorted by confidence descending; empty for non-string or very short input.
 *
 * @example
 * const facts = extractHeuristic("I prefer Postgres over SQLite for our backend database.");
 * // => [{ content: "Preference: Postgres over SQLite for our backend database",
 * //       category: "preference", confidence: 0.80 }]
 */
export function extractHeuristic(text, options = {}) {
  const { minConfidence = 0.65, maxFacts = 10 } = options;

  if (!text || typeof text !== 'string' || text.length < 10) {
    return [];
  }

  // Filter noise line-by-line, then match against the re-joined text.
  const cleanText = text
    .split('\n')
    .filter((line) => !isNoiseLine(line))
    .join('\n');

  const facts = [];
  const seen = new Set(); // dedup by normalized content

  for (const pattern of PATTERNS) {
    // Skip patterns that can never be emitted, so a rejected fact cannot
    // occupy a dedup slot and suppress an identical accepted one.
    if (!(pattern.confidence >= minConfidence)) continue;

    // The regexes are shared and global: reset state before scanning.
    pattern.regex.lastIndex = 0;

    let match;
    while ((match = pattern.regex.exec(cleanText)) !== null) {
      // Skip matches that are too short to be meaningful.
      if (match[0].length < 8) continue;

      let content;
      try {
        content = pattern.template(match);
        if (!content || content.length < 5) continue;
        if (!cognitiveNoiseFilter(content)) continue;
      } catch {
        // Best-effort tier: a failing template/filter skips this match only.
        continue;
      }

      const key = content.toLowerCase().replace(/\s+/g, ' ').trim();
      if (seen.has(key)) continue;
      seen.add(key);

      facts.push({
        content,
        category: pattern.category,
        confidence: pattern.confidence,
      });
    }
  }

  // Sort by confidence descending (stable, so pattern order breaks ties), then cap.
  facts.sort((a, b) => b.confidence - a.confidence);
  return facts.slice(0, Math.max(0, maxFacts));
}
308
+
309
/**
 * Quick check: does this text contain any extractable signals?
 * Cheaper than running full extraction — use as a gate.
 *
 * This only tests the raw patterns; it applies neither the line-level noise
 * filter nor the cognitive filter, so `true` means "worth running extraction",
 * not "extraction will return facts".
 *
 * @param {string} text
 * @returns {boolean} false for non-string or very short input, otherwise
 *   whether any pattern matches
 */
export function hasExtractableSignals(text) {
  // Same input contract as extractHeuristic: reject non-strings instead of
  // letting RegExp#test coerce them.
  if (typeof text !== 'string' || text.length < 10) return false;

  return PATTERNS.some((pattern) => {
    // Shared global regexes are stateful: start from 0 and leave them at 0.
    pattern.regex.lastIndex = 0;
    const found = pattern.regex.test(text);
    pattern.regex.lastIndex = 0;
    return found;
  });
}
package/src/git.js CHANGED
@@ -86,7 +86,13 @@ export async function getRecentCommits(repoPath, count = 20) {
86
86
  throw new Error(`Not a git repository: ${repoPath}`);
87
87
  }
88
88
  if (message.includes('ENOENT') || message.includes('not recognized')) {
89
- throw new Error('Git is not installed or not in PATH');
89
+ throw new Error(
90
+ 'Git binary not found. Git is required to ingest commits.\n' +
91
+ 'Please install Git and ensure it is added to your system PATH:\n' +
92
+ ' - Windows: Download from https://git-scm.com/download/win\n' +
93
+ ' - macOS: Run `brew install git` or install Xcode Command Line Tools\n' +
94
+ ' - Linux: Run `sudo apt-get install git` or equivalent.'
95
+ );
90
96
  }
91
97
  throw new Error(`Failed to read git log: ${message}`);
92
98
  }