@disco_trooper/apple-notes-mcp 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +104 -24
- package/package.json +10 -8
- package/src/config/claude.test.ts +47 -0
- package/src/config/claude.ts +106 -0
- package/src/config/constants.ts +11 -2
- package/src/config/paths.test.ts +40 -0
- package/src/config/paths.ts +86 -0
- package/src/db/arrow-fix.test.ts +101 -0
- package/src/db/lancedb.test.ts +209 -2
- package/src/db/lancedb.ts +345 -7
- package/src/embeddings/cache.test.ts +150 -0
- package/src/embeddings/cache.ts +204 -0
- package/src/embeddings/index.ts +21 -2
- package/src/embeddings/local.ts +61 -10
- package/src/embeddings/openrouter.ts +233 -11
- package/src/graph/export.test.ts +81 -0
- package/src/graph/export.ts +163 -0
- package/src/graph/extract.test.ts +90 -0
- package/src/graph/extract.ts +52 -0
- package/src/graph/queries.test.ts +156 -0
- package/src/graph/queries.ts +224 -0
- package/src/index.ts +249 -9
- package/src/notes/crud.test.ts +26 -2
- package/src/notes/crud.ts +43 -5
- package/src/notes/read.ts +83 -68
- package/src/search/chunk-indexer.test.ts +353 -0
- package/src/search/chunk-indexer.ts +207 -0
- package/src/search/chunk-search.test.ts +327 -0
- package/src/search/chunk-search.ts +298 -0
- package/src/search/indexer.ts +151 -109
- package/src/setup.ts +46 -67
- package/src/utils/chunker.test.ts +182 -0
- package/src/utils/chunker.ts +170 -0
- package/src/utils/content-filter.test.ts +225 -0
- package/src/utils/content-filter.ts +275 -0
- package/src/utils/runtime.test.ts +70 -0
- package/src/utils/runtime.ts +40 -0
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Text chunker with recursive character splitting that respects natural boundaries.
|
|
3
|
+
* Prioritizes splitting at: paragraphs > sentences > words > characters
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* Separators in priority order - prefer splitting at larger boundaries first
|
|
8
|
+
*/
|
|
9
|
+
export const SEPARATORS = [
|
|
10
|
+
"\n\n", // Paragraph
|
|
11
|
+
"\n", // Line
|
|
12
|
+
". ", // Sentence (period)
|
|
13
|
+
"! ", // Sentence (exclamation)
|
|
14
|
+
"? ", // Sentence (question)
|
|
15
|
+
"; ", // Clause
|
|
16
|
+
", ", // Phrase
|
|
17
|
+
" ", // Word
|
|
18
|
+
"", // Character (fallback)
|
|
19
|
+
] as const;
|
|
20
|
+
|
|
21
|
+
export interface ChunkOptions {
|
|
22
|
+
/** Maximum size of each chunk in characters */
|
|
23
|
+
chunkSize: number;
|
|
24
|
+
/** Number of characters to overlap between chunks */
|
|
25
|
+
overlap: number;
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
export interface ChunkResult {
|
|
29
|
+
/** The text content of this chunk */
|
|
30
|
+
content: string;
|
|
31
|
+
/** Zero-based index of this chunk */
|
|
32
|
+
index: number;
|
|
33
|
+
/** Total number of chunks */
|
|
34
|
+
totalChunks: number;
|
|
35
|
+
/** Start position in original text */
|
|
36
|
+
startPos: number;
|
|
37
|
+
/** End position in original text (exclusive) */
|
|
38
|
+
endPos: number;
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
export const DEFAULT_CHUNK_OPTIONS: ChunkOptions = {
|
|
42
|
+
chunkSize: 500,
|
|
43
|
+
overlap: 100,
|
|
44
|
+
};
|
|
45
|
+
|
|
46
|
+
/**
|
|
47
|
+
* Find the best split point near the target position.
|
|
48
|
+
* Searches for separators in priority order within a reasonable range.
|
|
49
|
+
*
|
|
50
|
+
* @param text - The full text to search in
|
|
51
|
+
* @param target - The target position to split near
|
|
52
|
+
* @returns The best split position (after the separator)
|
|
53
|
+
*/
|
|
54
|
+
export function findSplitPoint(text: string, target: number): number {
|
|
55
|
+
// Search window: look backwards and forwards from target
|
|
56
|
+
const searchWindow = Math.min(50, Math.floor(target / 2));
|
|
57
|
+
const searchStart = Math.max(0, target - searchWindow);
|
|
58
|
+
const searchEnd = Math.min(text.length, target + searchWindow);
|
|
59
|
+
const searchText = text.slice(searchStart, searchEnd);
|
|
60
|
+
|
|
61
|
+
// Try each separator in priority order
|
|
62
|
+
for (const sep of SEPARATORS) {
|
|
63
|
+
if (sep === "") continue; // Skip empty string fallback for now
|
|
64
|
+
|
|
65
|
+
// Find all occurrences of separator in search window
|
|
66
|
+
let bestPos = -1;
|
|
67
|
+
let bestDistance = Infinity;
|
|
68
|
+
|
|
69
|
+
let idx = 0;
|
|
70
|
+
while ((idx = searchText.indexOf(sep, idx)) !== -1) {
|
|
71
|
+
const absolutePos = searchStart + idx + sep.length;
|
|
72
|
+
const distance = Math.abs(absolutePos - target);
|
|
73
|
+
|
|
74
|
+
if (distance < bestDistance) {
|
|
75
|
+
bestDistance = distance;
|
|
76
|
+
bestPos = absolutePos;
|
|
77
|
+
}
|
|
78
|
+
idx += 1;
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
if (bestPos !== -1) {
|
|
82
|
+
return bestPos;
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
// No separator found, return target as-is
|
|
87
|
+
return target;
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
/**
|
|
91
|
+
* Split text into overlapping chunks that respect natural boundaries.
|
|
92
|
+
*
|
|
93
|
+
* @param text - The text to chunk
|
|
94
|
+
* @param options - Chunk size and overlap options
|
|
95
|
+
* @returns Array of chunk results
|
|
96
|
+
*/
|
|
97
|
+
export function chunkText(
|
|
98
|
+
text: string,
|
|
99
|
+
options: ChunkOptions = DEFAULT_CHUNK_OPTIONS
|
|
100
|
+
): ChunkResult[] {
|
|
101
|
+
const { chunkSize, overlap } = options;
|
|
102
|
+
|
|
103
|
+
// Handle empty or whitespace-only text
|
|
104
|
+
const trimmed = text.trim();
|
|
105
|
+
if (trimmed.length === 0) {
|
|
106
|
+
return [];
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
// If text fits in a single chunk, return it
|
|
110
|
+
if (text.length <= chunkSize) {
|
|
111
|
+
return [
|
|
112
|
+
{
|
|
113
|
+
content: text,
|
|
114
|
+
index: 0,
|
|
115
|
+
totalChunks: 1,
|
|
116
|
+
startPos: 0,
|
|
117
|
+
endPos: text.length,
|
|
118
|
+
},
|
|
119
|
+
];
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
const chunks: ChunkResult[] = [];
|
|
123
|
+
let startPos = 0;
|
|
124
|
+
// Minimum step size to ensure progress and avoid tiny chunks
|
|
125
|
+
const minStep = Math.max(1, chunkSize - overlap);
|
|
126
|
+
|
|
127
|
+
while (startPos < text.length) {
|
|
128
|
+
// Calculate target end position
|
|
129
|
+
let endPos = Math.min(startPos + chunkSize, text.length);
|
|
130
|
+
|
|
131
|
+
// If not at the end, find a good split point
|
|
132
|
+
if (endPos < text.length) {
|
|
133
|
+
const splitPoint = findSplitPoint(text, endPos);
|
|
134
|
+
// Only use split point if it creates a reasonably sized chunk
|
|
135
|
+
if (
|
|
136
|
+
splitPoint > startPos + minStep / 2 &&
|
|
137
|
+
splitPoint - startPos <= chunkSize * 1.2
|
|
138
|
+
) {
|
|
139
|
+
endPos = splitPoint;
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
// Extract chunk content
|
|
144
|
+
const content = text.slice(startPos, endPos);
|
|
145
|
+
|
|
146
|
+
chunks.push({
|
|
147
|
+
content,
|
|
148
|
+
index: chunks.length,
|
|
149
|
+
totalChunks: 0, // Will be set after all chunks are created
|
|
150
|
+
startPos,
|
|
151
|
+
endPos,
|
|
152
|
+
});
|
|
153
|
+
|
|
154
|
+
// If we've reached the end, stop
|
|
155
|
+
if (endPos >= text.length) {
|
|
156
|
+
break;
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
// Move to next chunk - ensure minimum step for progress
|
|
160
|
+
startPos = startPos + minStep;
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
// Set totalChunks on all chunks
|
|
164
|
+
const totalChunks = chunks.length;
|
|
165
|
+
for (const chunk of chunks) {
|
|
166
|
+
chunk.totalChunks = totalChunks;
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
return chunks;
|
|
170
|
+
}
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import {
|
|
3
|
+
calculateEntropy,
|
|
4
|
+
isLikelyBase64,
|
|
5
|
+
getBase64Ratio,
|
|
6
|
+
hasBinaryContent,
|
|
7
|
+
removeBase64Blocks,
|
|
8
|
+
redactSecrets,
|
|
9
|
+
filterContent,
|
|
10
|
+
shouldIndexContent,
|
|
11
|
+
} from "./content-filter.js";
|
|
12
|
+
|
|
13
|
+
// Unit tests for ./content-filter.js (vitest). Grouped one describe-block
// per exported function; fixtures use real high-entropy Base64 strings so
// the entropy heuristic (threshold 4.5 bits/char) actually fires.
describe("content-filter", () => {
  describe("calculateEntropy", () => {
    it("returns 0 for empty string", () => {
      expect(calculateEntropy("")).toBe(0);
    });

    it("returns low entropy for repetitive text", () => {
      // A single repeated character carries zero information.
      const entropy = calculateEntropy("aaaaaaaaaa");
      expect(entropy).toBe(0);
    });

    it("returns moderate entropy for normal text", () => {
      const entropy = calculateEntropy("Hello, this is normal text.");
      expect(entropy).toBeGreaterThan(2);
      expect(entropy).toBeLessThan(5);
    });

    it("returns high entropy for Base64 content", () => {
      const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5";
      const entropy = calculateEntropy(base64);
      expect(entropy).toBeGreaterThan(4.5);
    });
  });

  describe("isLikelyBase64", () => {
    it("returns false for short strings", () => {
      // Below the 40-char minimum length.
      expect(isLikelyBase64("abc123")).toBe(false);
    });

    it("returns false for normal text", () => {
      // Spaces/punctuation are outside the Base64 alphabet.
      expect(isLikelyBase64("This is normal text with spaces and punctuation!")).toBe(false);
    });

    it("returns true for Base64 encoded content", () => {
      const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5eyJpc3MiOiJodHRwczovL2V4YW1wbGUu";
      expect(isLikelyBase64(base64)).toBe(true);
    });

    it("returns true for URL-safe Base64", () => {
      const urlSafe = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5_abc-def123456";
      expect(isLikelyBase64(urlSafe)).toBe(true);
    });
  });

  describe("getBase64Ratio", () => {
    it("returns 0 for normal text", () => {
      const ratio = getBase64Ratio("This is completely normal text.");
      expect(ratio).toBe(0);
    });

    it("returns high ratio for mostly Base64 content", () => {
      // Use actual high-entropy Base64, not repeated chars
      const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5eyJpc3M".repeat(3);
      const content = "Token: " + base64;
      const ratio = getBase64Ratio(content);
      expect(ratio).toBeGreaterThan(0.5);
    });

    it("returns partial ratio for mixed content", () => {
      const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5";
      const content = `Normal text here. ${base64} More normal text.`;
      const ratio = getBase64Ratio(content);
      expect(ratio).toBeGreaterThan(0);
      expect(ratio).toBeLessThan(0.7);
    });
  });

  describe("hasBinaryContent", () => {
    it("returns false for normal text", () => {
      expect(hasBinaryContent("Normal text")).toBe(false);
    });

    it("returns false for text with newlines and tabs", () => {
      expect(hasBinaryContent("Line 1\nLine 2\tTabbed")).toBe(false);
    });

    it("returns true for null bytes", () => {
      expect(hasBinaryContent("Text\x00with null")).toBe(true);
    });

    it("returns true for control characters", () => {
      expect(hasBinaryContent("Text\x03with control")).toBe(true);
    });
  });

  describe("removeBase64Blocks", () => {
    it("removes Base64 blocks from content", () => {
      const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5eyJpc3M";
      const content = `API Token: ${base64}\n\nNext section...`;
      const result = removeBase64Blocks(content);

      expect(result).not.toContain(base64);
      expect(result).toContain("[ENCODED]");
      expect(result).toContain("API Token:");
      expect(result).toContain("Next section...");
    });

    it("preserves normal text", () => {
      const content = "This is completely normal text without any encoding.";
      expect(removeBase64Blocks(content)).toBe(content);
    });
  });

  describe("redactSecrets", () => {
    it("redacts JWT tokens", () => {
      const jwt = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.dozjgNryP4J3jVmNHl0w5N_XgL0n3I9PlFUP0THsR8U";
      const content = `Bearer ${jwt}`;
      const { content: redacted, secretsFound } = redactSecrets(content);

      expect(redacted).toContain("[JWT_REDACTED]");
      expect(secretsFound).toContain("jwt");
    });

    it("redacts AWS access keys", () => {
      const content = "AWS Key: AKIAIOSFODNN7EXAMPLE";
      const { content: redacted, secretsFound } = redactSecrets(content);

      expect(redacted).toContain("[AWSACCESSKEY_REDACTED]");
      expect(secretsFound).toContain("awsAccessKey");
    });

    it("redacts GitHub tokens", () => {
      const content = "Token: ghp_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
      const { content: redacted, secretsFound } = redactSecrets(content);

      expect(redacted).toContain("[GITHUBTOKEN_REDACTED]");
      expect(secretsFound).toContain("githubToken");
    });

    it("preserves normal text", () => {
      const content = "This is normal text without secrets.";
      const { content: redacted, secretsFound } = redactSecrets(content);

      expect(redacted).toBe(content);
      expect(secretsFound).toHaveLength(0);
    });
  });

  describe("filterContent", () => {
    it("returns 'index' for clean content", () => {
      // Content must be at least 50 chars (minContentLength default)
      const content = "This is clean, normal content for indexing. It contains enough text to pass the minimum length requirement.";
      const result = filterContent(content);

      expect(result.action).toBe("index");
      expect(result.cleanedContent).toBe(content);
      expect(result.reasons).toHaveLength(0);
    });

    it("returns 'skip' for binary content", () => {
      const result = filterContent("Text\x00with null bytes");

      expect(result.action).toBe("skip");
      expect(result.reasons).toContain("Contains binary content");
    });

    it("returns 'skip' for mostly Base64 content", () => {
      const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5".repeat(10);
      const result = filterContent(base64);

      expect(result.action).toBe("skip");
      expect(result.reasons[0]).toContain("Base64 encoded");
    });

    it("returns 'filter' for mixed content with Base64", () => {
      const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5";
      const content = `This is important text. Token: ${base64}. More important content here that we want to index.`;
      const result = filterContent(content);

      expect(result.action).toBe("filter");
      expect(result.cleanedContent).toContain("[ENCODED]");
      expect(result.cleanedContent).toContain("This is important text");
      expect(result.reasons.some(r => r.includes("Base64"))).toBe(true);
    });

    it("returns 'skip' if content too short after filtering", () => {
      // Short text + Base64 that will be removed, leaving less than 50 chars
      const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5eyJpc3M";
      const content = `Hi ${base64}`;
      const result = filterContent(content);

      expect(result.action).toBe("skip");
      // After removing Base64, only "Hi [ENCODED]" remains which is too short
    });

    it("respects custom configuration", () => {
      const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5eyJpc3M";
      // Need enough remaining content after potential filtering
      const content = `This is some text before the token. Token: ${base64}. And this is some text after the token that should remain.`;

      // With removeBase64 disabled, the Base64 should stay
      const result = filterContent(content, { removeBase64: false });

      expect(result.action).not.toBe("skip");
      expect(result.cleanedContent).toContain(base64);
    });
  });

  describe("shouldIndexContent", () => {
    it("returns true for normal content", () => {
      expect(shouldIndexContent("Normal text content")).toBe(true);
    });

    it("returns false for binary content", () => {
      expect(shouldIndexContent("Binary\x00content")).toBe(false);
    });

    it("returns false for mostly Base64", () => {
      const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5".repeat(10);
      expect(shouldIndexContent(base64)).toBe(false);
    });
  });
});
|
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Content quality filter for RAG indexing.
|
|
3
|
+
* Detects and filters Base64-encoded, binary, and secret content.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { createDebugLogger } from "./debug.js";
|
|
7
|
+
|
|
8
|
+
// Namespaced debug logger for this module (see ./debug.js).
const debug = createDebugLogger("CONTENT_FILTER");
|
|
9
|
+
|
|
10
|
+
/**
 * Result of content filtering.
 */
export interface FilterResult {
  /**
   * What to do with the content:
   * - "index":  content is clean; index it as-is
   * - "filter": content was cleaned (Base64 removed / secrets redacted); index cleanedContent
   * - "skip":   content is unusable (binary, mostly encoded, or too short after cleaning)
   */
  action: "index" | "filter" | "skip";
  /** Cleaned content (if action is "index" or "filter") */
  cleanedContent?: string;
  /** Reasons for filtering/skipping */
  reasons: string[];
}
|
|
21
|
+
|
|
22
|
+
/**
|
|
23
|
+
* Calculate Shannon entropy of a string.
|
|
24
|
+
* Higher entropy = more random/encoded content.
|
|
25
|
+
*
|
|
26
|
+
* Typical values:
|
|
27
|
+
* - Normal text: 0.8 - 4.5
|
|
28
|
+
* - Base64: 5.0 - 6.0
|
|
29
|
+
* - Encrypted: 6.0+
|
|
30
|
+
*
|
|
31
|
+
* @param str - String to analyze
|
|
32
|
+
* @returns Entropy value (0-8)
|
|
33
|
+
*/
|
|
34
|
+
export function calculateEntropy(str: string): number {
|
|
35
|
+
if (!str || str.length === 0) return 0;
|
|
36
|
+
|
|
37
|
+
const freq = new Map<string, number>();
|
|
38
|
+
for (const char of str) {
|
|
39
|
+
freq.set(char, (freq.get(char) || 0) + 1);
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
let entropy = 0;
|
|
43
|
+
const len = str.length;
|
|
44
|
+
for (const count of freq.values()) {
|
|
45
|
+
const p = count / len;
|
|
46
|
+
entropy -= p * Math.log2(p);
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
return entropy;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
/**
|
|
53
|
+
* Regex pattern for Base64 content (40+ chars).
|
|
54
|
+
*/
|
|
55
|
+
const BASE64_PATTERN = /[A-Za-z0-9+/]{40,}={0,2}/g;
|
|
56
|
+
|
|
57
|
+
/**
|
|
58
|
+
* Regex pattern for URL-safe Base64.
|
|
59
|
+
*/
|
|
60
|
+
const BASE64_URL_SAFE_PATTERN = /[A-Za-z0-9_-]{40,}={0,2}/g;
|
|
61
|
+
|
|
62
|
+
/**
|
|
63
|
+
* Patterns for common secrets/tokens.
|
|
64
|
+
*/
|
|
65
|
+
const SECRET_PATTERNS: Record<string, RegExp> = {
|
|
66
|
+
// Private Keys
|
|
67
|
+
privateKey: /-----BEGIN (?:RSA |DSA |EC |OPENSSH |PGP )?PRIVATE KEY(?: BLOCK)?-----/,
|
|
68
|
+
|
|
69
|
+
// JWT tokens
|
|
70
|
+
jwt: /eyJ[A-Za-z0-9-_=]+\.[A-Za-z0-9-_=]+\.?[A-Za-z0-9-_.+/=]*/g,
|
|
71
|
+
|
|
72
|
+
// AWS
|
|
73
|
+
awsAccessKey: /AKIA[0-9A-Z]{16}/g,
|
|
74
|
+
|
|
75
|
+
// GitHub
|
|
76
|
+
githubToken: /ghp_[a-zA-Z0-9]{36}/g,
|
|
77
|
+
githubFineGrained: /github_pat_[a-zA-Z0-9]{22}_[a-zA-Z0-9]{59}/g,
|
|
78
|
+
|
|
79
|
+
// Slack
|
|
80
|
+
slackToken: /xox[baprs]-[0-9a-zA-Z]{10,48}/g,
|
|
81
|
+
|
|
82
|
+
// Stripe
|
|
83
|
+
stripeKey: /sk_live_[0-9a-zA-Z]{24}/g,
|
|
84
|
+
|
|
85
|
+
// Database URIs with credentials
|
|
86
|
+
dbUri: /(?:mongodb|postgres(?:ql)?|mysql|redis):\/\/[^\s'"]+:[^\s'"]+@[^\s'"]+/g,
|
|
87
|
+
};
|
|
88
|
+
|
|
89
|
+
/**
|
|
90
|
+
* Check if a string segment is likely Base64 encoded.
|
|
91
|
+
*/
|
|
92
|
+
export function isLikelyBase64(str: string): boolean {
|
|
93
|
+
// Minimum length check
|
|
94
|
+
if (str.length < 40) return false;
|
|
95
|
+
|
|
96
|
+
// Check if only Base64 characters
|
|
97
|
+
if (!/^[A-Za-z0-9+/=_-]+$/.test(str)) return false;
|
|
98
|
+
|
|
99
|
+
// Check entropy - Base64 typically has high entropy
|
|
100
|
+
const entropy = calculateEntropy(str);
|
|
101
|
+
return entropy > 4.5;
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
/**
|
|
105
|
+
* Calculate the ratio of Base64-like content in a string.
|
|
106
|
+
*/
|
|
107
|
+
export function getBase64Ratio(content: string): number {
|
|
108
|
+
const matches = content.match(BASE64_PATTERN) || [];
|
|
109
|
+
const urlSafeMatches = content.match(BASE64_URL_SAFE_PATTERN) || [];
|
|
110
|
+
|
|
111
|
+
// Combine and deduplicate
|
|
112
|
+
const allMatches = new Set([...matches, ...urlSafeMatches]);
|
|
113
|
+
|
|
114
|
+
let totalBase64Length = 0;
|
|
115
|
+
for (const match of allMatches) {
|
|
116
|
+
if (isLikelyBase64(match)) {
|
|
117
|
+
totalBase64Length += match.length;
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
return content.length > 0 ? totalBase64Length / content.length : 0;
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
/**
|
|
125
|
+
* Check if content contains binary/control characters.
|
|
126
|
+
*/
|
|
127
|
+
export function hasBinaryContent(content: string): boolean {
|
|
128
|
+
// Check for null bytes or control characters (except newlines/tabs)
|
|
129
|
+
return /[\x00-\x08\x0B\x0C\x0E-\x1F]/.test(content);
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
/**
|
|
133
|
+
* Remove Base64 blocks from content.
|
|
134
|
+
*/
|
|
135
|
+
export function removeBase64Blocks(content: string): string {
|
|
136
|
+
let result = content;
|
|
137
|
+
|
|
138
|
+
// Remove standard Base64
|
|
139
|
+
result = result.replace(BASE64_PATTERN, (match) => {
|
|
140
|
+
if (isLikelyBase64(match)) {
|
|
141
|
+
return "[ENCODED]";
|
|
142
|
+
}
|
|
143
|
+
return match;
|
|
144
|
+
});
|
|
145
|
+
|
|
146
|
+
// Remove URL-safe Base64
|
|
147
|
+
result = result.replace(BASE64_URL_SAFE_PATTERN, (match) => {
|
|
148
|
+
if (isLikelyBase64(match)) {
|
|
149
|
+
return "[ENCODED]";
|
|
150
|
+
}
|
|
151
|
+
return match;
|
|
152
|
+
});
|
|
153
|
+
|
|
154
|
+
return result;
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
/**
|
|
158
|
+
* Redact detected secrets in content.
|
|
159
|
+
*/
|
|
160
|
+
export function redactSecrets(content: string): { content: string; secretsFound: string[] } {
|
|
161
|
+
let result = content;
|
|
162
|
+
const secretsFound: string[] = [];
|
|
163
|
+
|
|
164
|
+
for (const [name, pattern] of Object.entries(SECRET_PATTERNS)) {
|
|
165
|
+
if (pattern.test(result)) {
|
|
166
|
+
// Reset lastIndex for global patterns
|
|
167
|
+
pattern.lastIndex = 0;
|
|
168
|
+
result = result.replace(pattern, `[${name.toUpperCase()}_REDACTED]`);
|
|
169
|
+
secretsFound.push(name);
|
|
170
|
+
}
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
return { content: result, secretsFound };
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
/**
 * Configuration for content filtering.
 */
export interface FilterConfig {
  /** Maximum Base64 ratio before skipping (default: 0.5) */
  maxBase64Ratio?: number;
  /** Minimum meaningful content length after filtering (default: 50) */
  minContentLength?: number;
  /** Whether to redact secrets (default: true) */
  redactSecrets?: boolean;
  /** Whether to remove Base64 blocks (default: true) */
  removeBase64?: boolean;
}

/** Defaults merged into the caller's config by filterContent. */
const DEFAULT_CONFIG: Required<FilterConfig> = {
  maxBase64Ratio: 0.5,
  minContentLength: 50,
  redactSecrets: true,
  removeBase64: true,
};
|
|
196
|
+
|
|
197
|
+
/**
|
|
198
|
+
* Filter content for RAG indexing.
|
|
199
|
+
*
|
|
200
|
+
* @param content - Raw content to filter
|
|
201
|
+
* @param config - Filter configuration
|
|
202
|
+
* @returns Filter result with action and cleaned content
|
|
203
|
+
*/
|
|
204
|
+
export function filterContent(
|
|
205
|
+
content: string,
|
|
206
|
+
config: FilterConfig = {}
|
|
207
|
+
): FilterResult {
|
|
208
|
+
const cfg = { ...DEFAULT_CONFIG, ...config };
|
|
209
|
+
const reasons: string[] = [];
|
|
210
|
+
|
|
211
|
+
// 1. Check for binary content - skip entirely
|
|
212
|
+
if (hasBinaryContent(content)) {
|
|
213
|
+
debug("Skipping content with binary characters");
|
|
214
|
+
return { action: "skip", reasons: ["Contains binary content"] };
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
// 2. Calculate Base64 ratio
|
|
218
|
+
const base64Ratio = getBase64Ratio(content);
|
|
219
|
+
debug(`Base64 ratio: ${(base64Ratio * 100).toFixed(1)}%`);
|
|
220
|
+
|
|
221
|
+
// Skip if too much encoded content
|
|
222
|
+
if (base64Ratio > cfg.maxBase64Ratio) {
|
|
223
|
+
debug(`Skipping content: ${(base64Ratio * 100).toFixed(1)}% Base64`);
|
|
224
|
+
return {
|
|
225
|
+
action: "skip",
|
|
226
|
+
reasons: [`${(base64Ratio * 100).toFixed(1)}% is Base64 encoded (threshold: ${(cfg.maxBase64Ratio * 100).toFixed(0)}%)`],
|
|
227
|
+
};
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
let cleanedContent = content;
|
|
231
|
+
|
|
232
|
+
// 3. Remove Base64 blocks if present and configured
|
|
233
|
+
if (cfg.removeBase64 && base64Ratio > 0.1) {
|
|
234
|
+
cleanedContent = removeBase64Blocks(cleanedContent);
|
|
235
|
+
reasons.push("Removed Base64 blocks");
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
// 4. Redact secrets if configured
|
|
239
|
+
if (cfg.redactSecrets) {
|
|
240
|
+
const { content: redacted, secretsFound } = redactSecrets(cleanedContent);
|
|
241
|
+
if (secretsFound.length > 0) {
|
|
242
|
+
cleanedContent = redacted;
|
|
243
|
+
reasons.push(`Redacted secrets: ${secretsFound.join(", ")}`);
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
// 5. Check if remaining content is meaningful
|
|
248
|
+
const meaningfulContent = cleanedContent
|
|
249
|
+
.replace(/\[.*?_REDACTED\]|\[ENCODED\]/g, "")
|
|
250
|
+
.trim();
|
|
251
|
+
|
|
252
|
+
if (meaningfulContent.length < cfg.minContentLength) {
|
|
253
|
+
debug(`Skipping: insufficient content after filtering (${meaningfulContent.length} chars)`);
|
|
254
|
+
return {
|
|
255
|
+
action: "skip",
|
|
256
|
+
reasons: ["Insufficient meaningful content after filtering"],
|
|
257
|
+
};
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
// Determine action
|
|
261
|
+
const action = reasons.length > 0 ? "filter" : "index";
|
|
262
|
+
|
|
263
|
+
return { action, cleanedContent, reasons };
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
/**
|
|
267
|
+
* Quick check if content should be indexed.
|
|
268
|
+
* Use this for fast pre-filtering before chunking.
|
|
269
|
+
*/
|
|
270
|
+
export function shouldIndexContent(content: string): boolean {
|
|
271
|
+
// Quick checks
|
|
272
|
+
if (hasBinaryContent(content)) return false;
|
|
273
|
+
if (getBase64Ratio(content) > 0.5) return false;
|
|
274
|
+
return true;
|
|
275
|
+
}
|