@disco_trooper/apple-notes-mcp 1.1.0 → 1.3.0
This diff reflects the changes between publicly released package versions as they appear in their respective public registries and is provided for informational purposes only.
- package/README.md +104 -24
- package/package.json +11 -12
- package/src/config/claude.test.ts +47 -0
- package/src/config/claude.ts +106 -0
- package/src/config/constants.ts +11 -2
- package/src/config/paths.test.ts +40 -0
- package/src/config/paths.ts +86 -0
- package/src/db/arrow-fix.test.ts +101 -0
- package/src/db/lancedb.test.ts +254 -2
- package/src/db/lancedb.ts +385 -38
- package/src/embeddings/cache.test.ts +150 -0
- package/src/embeddings/cache.ts +204 -0
- package/src/embeddings/index.ts +22 -4
- package/src/embeddings/local.ts +57 -17
- package/src/embeddings/openrouter.ts +233 -11
- package/src/errors/index.test.ts +64 -0
- package/src/errors/index.ts +62 -0
- package/src/graph/export.test.ts +81 -0
- package/src/graph/export.ts +163 -0
- package/src/graph/extract.test.ts +90 -0
- package/src/graph/extract.ts +52 -0
- package/src/graph/queries.test.ts +156 -0
- package/src/graph/queries.ts +224 -0
- package/src/index.ts +309 -23
- package/src/notes/conversion.ts +62 -0
- package/src/notes/crud.test.ts +41 -8
- package/src/notes/crud.ts +75 -64
- package/src/notes/read.test.ts +58 -3
- package/src/notes/read.ts +142 -210
- package/src/notes/resolve.ts +174 -0
- package/src/notes/tables.ts +69 -40
- package/src/search/chunk-indexer.test.ts +353 -0
- package/src/search/chunk-indexer.ts +207 -0
- package/src/search/chunk-search.test.ts +327 -0
- package/src/search/chunk-search.ts +298 -0
- package/src/search/index.ts +4 -6
- package/src/search/indexer.ts +164 -109
- package/src/setup.ts +46 -67
- package/src/types/index.ts +4 -0
- package/src/utils/chunker.test.ts +182 -0
- package/src/utils/chunker.ts +170 -0
- package/src/utils/content-filter.test.ts +225 -0
- package/src/utils/content-filter.ts +275 -0
- package/src/utils/debug.ts +0 -2
- package/src/utils/runtime.test.ts +70 -0
- package/src/utils/runtime.ts +40 -0
- package/src/utils/text.test.ts +32 -0
- package/CLAUDE.md +0 -56
- package/src/server.ts +0 -427

package/src/utils/chunker.test.ts
@@ -0,0 +1,182 @@
+import { describe, expect, it } from "vitest";
+import {
+  chunkText,
+  type ChunkOptions,
+  DEFAULT_CHUNK_OPTIONS,
+  SEPARATORS,
+  findSplitPoint,
+} from "./chunker.js";
+
+describe("chunker", () => {
+  describe("exports", () => {
+    it("exports SEPARATORS array with correct order", () => {
+      expect(SEPARATORS).toEqual([
+        "\n\n",
+        "\n",
+        ". ",
+        "! ",
+        "? ",
+        "; ",
+        ", ",
+        " ",
+        "",
+      ]);
+    });
+
+    it("exports DEFAULT_CHUNK_OPTIONS with correct values", () => {
+      expect(DEFAULT_CHUNK_OPTIONS).toEqual({
+        chunkSize: 500,
+        overlap: 100,
+      });
+    });
+  });
+
+  describe("findSplitPoint", () => {
+    it("finds paragraph boundary near target", () => {
+      const text = "First paragraph.\n\nSecond paragraph.";
+      const target = 20;
+      const result = findSplitPoint(text, target);
+      // Should find the \n\n at position 16
+      expect(result).toBe(18); // After \n\n
+    });
+
+    it("falls back to sentence boundary", () => {
+      const text = "First sentence. Second sentence.";
+      const target = 18;
+      const result = findSplitPoint(text, target);
+      // Should find ". " at position 14-16
+      expect(result).toBe(16); // After ". "
+    });
+
+    it("falls back to word boundary", () => {
+      const text = "oneword anotherword";
+      const target = 10;
+      const result = findSplitPoint(text, target);
+      // Should find space at position 7
+      expect(result).toBe(8); // After " "
+    });
+
+    it("returns target when no separator found", () => {
+      const text = "noseparatorshere";
+      const target = 8;
+      const result = findSplitPoint(text, target);
+      expect(result).toBe(8);
+    });
+  });
+
+  describe("chunkText", () => {
+    it("returns single chunk for short text", () => {
+      const text = "Short text";
+      const options: ChunkOptions = { chunkSize: 100, overlap: 20 };
+
+      const result = chunkText(text, options);
+
+      expect(result).toHaveLength(1);
+      expect(result[0]).toEqual({
+        content: "Short text",
+        index: 0,
+        totalChunks: 1,
+        startPos: 0,
+        endPos: 10,
+      });
+    });
+
+    it("creates multiple chunks for long text", () => {
+      const text = "Word ".repeat(50).trim(); // 249 chars
+      const options: ChunkOptions = { chunkSize: 50, overlap: 10 };
+
+      const result = chunkText(text, options);
+
+      expect(result.length).toBeGreaterThan(1);
+      // Each chunk should have content
+      result.forEach((chunk) => {
+        expect(chunk.content.length).toBeGreaterThan(0);
+        expect(chunk.content.length).toBeLessThanOrEqual(options.chunkSize);
+      });
+    });
+
+    it("includes overlap between chunks", () => {
+      const text = "First part. Second part. Third part. Fourth part.";
+      const options: ChunkOptions = { chunkSize: 25, overlap: 10 };
+
+      const result = chunkText(text, options);
+
+      // Check that chunks overlap - endPos of chunk N should be > startPos of chunk N+1
+      for (let i = 0; i < result.length - 1; i++) {
+        const currentChunk = result[i];
+        const nextChunk = result[i + 1];
+        // Overlap means next chunk starts before current chunk ends
+        expect(nextChunk.startPos).toBeLessThan(currentChunk.endPos);
+      }
+    });
+
+    it("respects paragraph boundaries when splitting", () => {
+      const text = "First paragraph here.\n\nSecond paragraph here.\n\nThird paragraph.";
+      const options: ChunkOptions = { chunkSize: 30, overlap: 5 };
+
+      const result = chunkText(text, options);
+
+      // At least one chunk should end at a paragraph boundary
+      const hasParaBoundary = result.some((chunk) => {
+        const endContent = text.slice(chunk.startPos, chunk.endPos);
+        return endContent.endsWith("\n\n") || chunk.endPos === text.length;
+      });
+      expect(hasParaBoundary).toBe(true);
+    });
+
+    it("sets correct totalChunks on all chunks", () => {
+      const text = "A ".repeat(100).trim(); // Create text that will be chunked
+      const options: ChunkOptions = { chunkSize: 20, overlap: 5 };
+
+      const result = chunkText(text, options);
+
+      const expectedTotal = result.length;
+      result.forEach((chunk, idx) => {
+        expect(chunk.totalChunks).toBe(expectedTotal);
+        expect(chunk.index).toBe(idx);
+      });
+    });
+
+    it("handles empty text", () => {
+      const result = chunkText("", { chunkSize: 100, overlap: 20 });
+
+      expect(result).toHaveLength(0);
+    });
+
+    it("handles whitespace-only text", () => {
+      const result = chunkText(" \n\n ", { chunkSize: 100, overlap: 20 });
+
+      expect(result).toHaveLength(0);
+    });
+
+    it("uses default options when not provided", () => {
+      const text = "Test";
+      const result = chunkText(text);
+
+      expect(result).toHaveLength(1);
+      expect(result[0].content).toBe("Test");
+    });
+
+    it("covers all original text with chunks", () => {
+      const text = "The quick brown fox jumps over the lazy dog. Pack my box with five dozen liquor jugs.";
+      const options: ChunkOptions = { chunkSize: 30, overlap: 10 };
+
+      const result = chunkText(text, options);
+
+      // Verify chunks cover the entire text
+      expect(result[0].startPos).toBe(0);
+      expect(result[result.length - 1].endPos).toBe(text.length);
+
+      // Verify each chunk's content matches its position in original text
+      for (const chunk of result) {
+        expect(chunk.content).toBe(text.slice(chunk.startPos, chunk.endPos));
+      }
+
+      // Verify chunks are contiguous (no gaps)
+      for (let i = 0; i < result.length - 1; i++) {
+        // Next chunk should start before or at current chunk's end (overlap)
+        expect(result[i + 1].startPos).toBeLessThanOrEqual(result[i].endPos);
+      }
+    });
+  });
+});

package/src/utils/chunker.ts
@@ -0,0 +1,170 @@
+/**
+ * Text chunker with recursive character splitting that respects natural boundaries.
+ * Prioritizes splitting at: paragraphs > sentences > words > characters
+ */
+
+/**
+ * Separators in priority order - prefer splitting at larger boundaries first
+ */
+export const SEPARATORS = [
+  "\n\n", // Paragraph
+  "\n", // Line
+  ". ", // Sentence (period)
+  "! ", // Sentence (exclamation)
+  "? ", // Sentence (question)
+  "; ", // Clause
+  ", ", // Phrase
+  " ", // Word
+  "", // Character (fallback)
+] as const;
+
+export interface ChunkOptions {
+  /** Maximum size of each chunk in characters */
+  chunkSize: number;
+  /** Number of characters to overlap between chunks */
+  overlap: number;
+}
+
+export interface ChunkResult {
+  /** The text content of this chunk */
+  content: string;
+  /** Zero-based index of this chunk */
+  index: number;
+  /** Total number of chunks */
+  totalChunks: number;
+  /** Start position in original text */
+  startPos: number;
+  /** End position in original text (exclusive) */
+  endPos: number;
+}
+
+export const DEFAULT_CHUNK_OPTIONS: ChunkOptions = {
+  chunkSize: 500,
+  overlap: 100,
+};
+
+/**
+ * Find the best split point near the target position.
+ * Searches for separators in priority order within a reasonable range.
+ *
+ * @param text - The full text to search in
+ * @param target - The target position to split near
+ * @returns The best split position (after the separator)
+ */
+export function findSplitPoint(text: string, target: number): number {
+  // Search window: look backwards and forwards from target
+  const searchWindow = Math.min(50, Math.floor(target / 2));
+  const searchStart = Math.max(0, target - searchWindow);
+  const searchEnd = Math.min(text.length, target + searchWindow);
+  const searchText = text.slice(searchStart, searchEnd);
+
+  // Try each separator in priority order
+  for (const sep of SEPARATORS) {
+    if (sep === "") continue; // Skip empty string fallback for now
+
+    // Find all occurrences of separator in search window
+    let bestPos = -1;
+    let bestDistance = Infinity;
+
+    let idx = 0;
+    while ((idx = searchText.indexOf(sep, idx)) !== -1) {
+      const absolutePos = searchStart + idx + sep.length;
+      const distance = Math.abs(absolutePos - target);
+
+      if (distance < bestDistance) {
+        bestDistance = distance;
+        bestPos = absolutePos;
+      }
+      idx += 1;
+    }
+
+    if (bestPos !== -1) {
+      return bestPos;
+    }
+  }
+
+  // No separator found, return target as-is
+  return target;
+}
+
+/**
+ * Split text into overlapping chunks that respect natural boundaries.
+ *
+ * @param text - The text to chunk
+ * @param options - Chunk size and overlap options
+ * @returns Array of chunk results
+ */
+export function chunkText(
+  text: string,
+  options: ChunkOptions = DEFAULT_CHUNK_OPTIONS
+): ChunkResult[] {
+  const { chunkSize, overlap } = options;
+
+  // Handle empty or whitespace-only text
+  const trimmed = text.trim();
+  if (trimmed.length === 0) {
+    return [];
+  }
+
+  // If text fits in a single chunk, return it
+  if (text.length <= chunkSize) {
+    return [
+      {
+        content: text,
+        index: 0,
+        totalChunks: 1,
+        startPos: 0,
+        endPos: text.length,
+      },
+    ];
+  }
+
+  const chunks: ChunkResult[] = [];
+  let startPos = 0;
+  // Minimum step size to ensure progress and avoid tiny chunks
+  const minStep = Math.max(1, chunkSize - overlap);
+
+  while (startPos < text.length) {
+    // Calculate target end position
+    let endPos = Math.min(startPos + chunkSize, text.length);
+
+    // If not at the end, find a good split point
+    if (endPos < text.length) {
+      const splitPoint = findSplitPoint(text, endPos);
+      // Only use split point if it creates a reasonably sized chunk
+      if (
+        splitPoint > startPos + minStep / 2 &&
+        splitPoint - startPos <= chunkSize * 1.2
+      ) {
+        endPos = splitPoint;
+      }
+    }
+
+    // Extract chunk content
+    const content = text.slice(startPos, endPos);
+
+    chunks.push({
+      content,
+      index: chunks.length,
+      totalChunks: 0, // Will be set after all chunks are created
+      startPos,
+      endPos,
+    });
+
+    // If we've reached the end, stop
+    if (endPos >= text.length) {
+      break;
+    }
+
+    // Move to next chunk - ensure minimum step for progress
+    startPos = startPos + minStep;
+  }
+
+  // Set totalChunks on all chunks
+  const totalChunks = chunks.length;
+  for (const chunk of chunks) {
+    chunk.totalChunks = totalChunks;
+  }
+
+  return chunks;
+}
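
For orientation, a minimal usage sketch of the chunkText API added above. The note text and logging are illustrative only and not part of the package; the relative import assumes a file sitting next to chunker.ts.

import { chunkText, DEFAULT_CHUNK_OPTIONS } from "./chunker.js";

// Illustrative input only; any sufficiently long string behaves the same way.
const noteBody = "First paragraph of a note.\n\nSecond paragraph with more detail. ".repeat(20);

// Split into ~500-character chunks with 100 characters of overlap (the exported defaults).
const chunks = chunkText(noteBody, DEFAULT_CHUNK_OPTIONS);

for (const chunk of chunks) {
  // startPos/endPos map each chunk back to its location in the original text.
  console.log(`chunk ${chunk.index + 1}/${chunk.totalChunks}: [${chunk.startPos}, ${chunk.endPos})`);
}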

package/src/utils/content-filter.test.ts
@@ -0,0 +1,225 @@
+import { describe, it, expect } from "vitest";
+import {
+  calculateEntropy,
+  isLikelyBase64,
+  getBase64Ratio,
+  hasBinaryContent,
+  removeBase64Blocks,
+  redactSecrets,
+  filterContent,
+  shouldIndexContent,
+} from "./content-filter.js";
+
+describe("content-filter", () => {
+  describe("calculateEntropy", () => {
+    it("returns 0 for empty string", () => {
+      expect(calculateEntropy("")).toBe(0);
+    });
+
+    it("returns low entropy for repetitive text", () => {
+      const entropy = calculateEntropy("aaaaaaaaaa");
+      expect(entropy).toBe(0);
+    });
+
+    it("returns moderate entropy for normal text", () => {
+      const entropy = calculateEntropy("Hello, this is normal text.");
+      expect(entropy).toBeGreaterThan(2);
+      expect(entropy).toBeLessThan(5);
+    });
+
+    it("returns high entropy for Base64 content", () => {
+      const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5";
+      const entropy = calculateEntropy(base64);
+      expect(entropy).toBeGreaterThan(4.5);
+    });
+  });
+
+  describe("isLikelyBase64", () => {
+    it("returns false for short strings", () => {
+      expect(isLikelyBase64("abc123")).toBe(false);
+    });
+
+    it("returns false for normal text", () => {
+      expect(isLikelyBase64("This is normal text with spaces and punctuation!")).toBe(false);
+    });
+
+    it("returns true for Base64 encoded content", () => {
+      const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5eyJpc3MiOiJodHRwczovL2V4YW1wbGUu";
+      expect(isLikelyBase64(base64)).toBe(true);
+    });
+
+    it("returns true for URL-safe Base64", () => {
+      const urlSafe = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5_abc-def123456";
+      expect(isLikelyBase64(urlSafe)).toBe(true);
+    });
+  });
+
+  describe("getBase64Ratio", () => {
+    it("returns 0 for normal text", () => {
+      const ratio = getBase64Ratio("This is completely normal text.");
+      expect(ratio).toBe(0);
+    });
+
+    it("returns high ratio for mostly Base64 content", () => {
+      // Use actual high-entropy Base64, not repeated chars
+      const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5eyJpc3M".repeat(3);
+      const content = "Token: " + base64;
+      const ratio = getBase64Ratio(content);
+      expect(ratio).toBeGreaterThan(0.5);
+    });
+
+    it("returns partial ratio for mixed content", () => {
+      const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5";
+      const content = `Normal text here. ${base64} More normal text.`;
+      const ratio = getBase64Ratio(content);
+      expect(ratio).toBeGreaterThan(0);
+      expect(ratio).toBeLessThan(0.7);
+    });
+  });
+
+  describe("hasBinaryContent", () => {
+    it("returns false for normal text", () => {
+      expect(hasBinaryContent("Normal text")).toBe(false);
+    });
+
+    it("returns false for text with newlines and tabs", () => {
+      expect(hasBinaryContent("Line 1\nLine 2\tTabbed")).toBe(false);
+    });
+
+    it("returns true for null bytes", () => {
+      expect(hasBinaryContent("Text\x00with null")).toBe(true);
+    });
+
+    it("returns true for control characters", () => {
+      expect(hasBinaryContent("Text\x03with control")).toBe(true);
+    });
+  });
+
+  describe("removeBase64Blocks", () => {
+    it("removes Base64 blocks from content", () => {
+      const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5eyJpc3M";
+      const content = `API Token: ${base64}\n\nNext section...`;
+      const result = removeBase64Blocks(content);
+
+      expect(result).not.toContain(base64);
+      expect(result).toContain("[ENCODED]");
+      expect(result).toContain("API Token:");
+      expect(result).toContain("Next section...");
+    });
+
+    it("preserves normal text", () => {
+      const content = "This is completely normal text without any encoding.";
+      expect(removeBase64Blocks(content)).toBe(content);
+    });
+  });
+
+  describe("redactSecrets", () => {
+    it("redacts JWT tokens", () => {
+      const jwt = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.dozjgNryP4J3jVmNHl0w5N_XgL0n3I9PlFUP0THsR8U";
+      const content = `Bearer ${jwt}`;
+      const { content: redacted, secretsFound } = redactSecrets(content);
+
+      expect(redacted).toContain("[JWT_REDACTED]");
+      expect(secretsFound).toContain("jwt");
+    });
+
+    it("redacts AWS access keys", () => {
+      const content = "AWS Key: AKIAIOSFODNN7EXAMPLE";
+      const { content: redacted, secretsFound } = redactSecrets(content);
+
+      expect(redacted).toContain("[AWSACCESSKEY_REDACTED]");
+      expect(secretsFound).toContain("awsAccessKey");
+    });
+
+    it("redacts GitHub tokens", () => {
+      const content = "Token: ghp_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
+      const { content: redacted, secretsFound } = redactSecrets(content);
+
+      expect(redacted).toContain("[GITHUBTOKEN_REDACTED]");
+      expect(secretsFound).toContain("githubToken");
+    });
+
+    it("preserves normal text", () => {
+      const content = "This is normal text without secrets.";
+      const { content: redacted, secretsFound } = redactSecrets(content);
+
+      expect(redacted).toBe(content);
+      expect(secretsFound).toHaveLength(0);
+    });
+  });
+
+  describe("filterContent", () => {
+    it("returns 'index' for clean content", () => {
+      // Content must be at least 50 chars (minContentLength default)
+      const content = "This is clean, normal content for indexing. It contains enough text to pass the minimum length requirement.";
+      const result = filterContent(content);
+
+      expect(result.action).toBe("index");
+      expect(result.cleanedContent).toBe(content);
+      expect(result.reasons).toHaveLength(0);
+    });
+
+    it("returns 'skip' for binary content", () => {
+      const result = filterContent("Text\x00with null bytes");
+
+      expect(result.action).toBe("skip");
+      expect(result.reasons).toContain("Contains binary content");
+    });
+
+    it("returns 'skip' for mostly Base64 content", () => {
+      const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5".repeat(10);
+      const result = filterContent(base64);
+
+      expect(result.action).toBe("skip");
+      expect(result.reasons[0]).toContain("Base64 encoded");
+    });
+
+    it("returns 'filter' for mixed content with Base64", () => {
+      const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5";
+      const content = `This is important text. Token: ${base64}. More important content here that we want to index.`;
+      const result = filterContent(content);
+
+      expect(result.action).toBe("filter");
+      expect(result.cleanedContent).toContain("[ENCODED]");
+      expect(result.cleanedContent).toContain("This is important text");
+      expect(result.reasons.some(r => r.includes("Base64"))).toBe(true);
+    });
+
+    it("returns 'skip' if content too short after filtering", () => {
+      // Short text + Base64 that will be removed, leaving less than 50 chars
+      const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5eyJpc3M";
+      const content = `Hi ${base64}`;
+      const result = filterContent(content);
+
+      expect(result.action).toBe("skip");
+      // After removing Base64, only "Hi [ENCODED]" remains which is too short
+    });
+
+    it("respects custom configuration", () => {
+      const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5eyJpc3M";
+      // Need enough remaining content after potential filtering
+      const content = `This is some text before the token. Token: ${base64}. And this is some text after the token that should remain.`;
+
+      // With removeBase64 disabled, the Base64 should stay
+      const result = filterContent(content, { removeBase64: false });
+
+      expect(result.action).not.toBe("skip");
+      expect(result.cleanedContent).toContain(base64);
+    });
+  });
+
+  describe("shouldIndexContent", () => {
+    it("returns true for normal content", () => {
+      expect(shouldIndexContent("Normal text content")).toBe(true);
+    });
+
+    it("returns false for binary content", () => {
+      expect(shouldIndexContent("Binary\x00content")).toBe(false);
+    });
+
+    it("returns false for mostly Base64", () => {
+      const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5".repeat(10);
+      expect(shouldIndexContent(base64)).toBe(false);
+    });
+  });
+});
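
The exports exercised above suggest how note content could be screened before chunking and indexing. The sketch below is inferred only from the test expectations (filterContent returning { action, cleanedContent, reasons }); whether the package's chunk indexer wires these helpers together exactly this way is not shown in this excerpt, and the sample note text is hypothetical.

import { filterContent } from "./content-filter.js";
import { chunkText } from "./chunker.js";

// Hypothetical note mixing prose with an encoded blob.
const raw =
  "Meeting notes for Tuesday. Shared token: ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5. Follow up next week.";

const filtered = filterContent(raw);
if (filtered.action !== "skip") {
  // Index the cleaned text (Base64 blocks replaced, secrets redacted) rather than the raw note.
  const chunks = chunkText(filtered.cleanedContent);
  console.log(`indexing ${chunks.length} chunk(s); filter reasons: ${filtered.reasons.join(", ") || "none"}`);
}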