@disco_trooper/apple-notes-mcp 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,170 @@
+ /**
+  * Text chunker with recursive character splitting that respects natural boundaries.
+  * Prioritizes splitting at: paragraphs > sentences > words > characters
+  */
+
+ /**
+  * Separators in priority order - prefer splitting at larger boundaries first
+  */
+ export const SEPARATORS = [
+   "\n\n", // Paragraph
+   "\n", // Line
+   ". ", // Sentence (period)
+   "! ", // Sentence (exclamation)
+   "? ", // Sentence (question)
+   "; ", // Clause
+   ", ", // Phrase
+   " ", // Word
+   "", // Character (fallback)
+ ] as const;
+
+ export interface ChunkOptions {
+   /** Maximum size of each chunk in characters */
+   chunkSize: number;
+   /** Number of characters to overlap between chunks */
+   overlap: number;
+ }
+
+ export interface ChunkResult {
+   /** The text content of this chunk */
+   content: string;
+   /** Zero-based index of this chunk */
+   index: number;
+   /** Total number of chunks */
+   totalChunks: number;
+   /** Start position in original text */
+   startPos: number;
+   /** End position in original text (exclusive) */
+   endPos: number;
+ }
+
+ export const DEFAULT_CHUNK_OPTIONS: ChunkOptions = {
+   chunkSize: 500,
+   overlap: 100,
+ };
+
+ /**
+  * Find the best split point near the target position.
+  * Searches for separators in priority order within a reasonable range.
+  *
+  * @param text - The full text to search in
+  * @param target - The target position to split near
+  * @returns The best split position (after the separator)
+  */
+ export function findSplitPoint(text: string, target: number): number {
+   // Search window: look backwards and forwards from target
+   const searchWindow = Math.min(50, Math.floor(target / 2));
+   const searchStart = Math.max(0, target - searchWindow);
+   const searchEnd = Math.min(text.length, target + searchWindow);
+   const searchText = text.slice(searchStart, searchEnd);
+
+   // Try each separator in priority order
+   for (const sep of SEPARATORS) {
+     if (sep === "") continue; // Skip empty string fallback for now
+
+     // Find all occurrences of separator in search window
+     let bestPos = -1;
+     let bestDistance = Infinity;
+
+     let idx = 0;
+     while ((idx = searchText.indexOf(sep, idx)) !== -1) {
+       const absolutePos = searchStart + idx + sep.length;
+       const distance = Math.abs(absolutePos - target);
+
+       if (distance < bestDistance) {
+         bestDistance = distance;
+         bestPos = absolutePos;
+       }
+       idx += 1;
+     }
+
+     if (bestPos !== -1) {
+       return bestPos;
+     }
+   }
+
+   // No separator found, return target as-is
+   return target;
+ }
+
+ /**
+  * Split text into overlapping chunks that respect natural boundaries.
+  *
+  * @param text - The text to chunk
+  * @param options - Chunk size and overlap options
+  * @returns Array of chunk results
+  */
+ export function chunkText(
+   text: string,
+   options: ChunkOptions = DEFAULT_CHUNK_OPTIONS
+ ): ChunkResult[] {
+   const { chunkSize, overlap } = options;
+
+   // Handle empty or whitespace-only text
+   const trimmed = text.trim();
+   if (trimmed.length === 0) {
+     return [];
+   }
+
+   // If text fits in a single chunk, return it
+   if (text.length <= chunkSize) {
+     return [
+       {
+         content: text,
+         index: 0,
+         totalChunks: 1,
+         startPos: 0,
+         endPos: text.length,
+       },
+     ];
+   }
+
+   const chunks: ChunkResult[] = [];
+   let startPos = 0;
+   // Minimum step size to ensure progress and avoid tiny chunks
+   const minStep = Math.max(1, chunkSize - overlap);
+
+   while (startPos < text.length) {
+     // Calculate target end position
+     let endPos = Math.min(startPos + chunkSize, text.length);
+
+     // If not at the end, find a good split point
+     if (endPos < text.length) {
+       const splitPoint = findSplitPoint(text, endPos);
+       // Only use split point if it creates a reasonably sized chunk
+       if (
+         splitPoint > startPos + minStep / 2 &&
+         splitPoint - startPos <= chunkSize * 1.2
+       ) {
+         endPos = splitPoint;
+       }
+     }
+
+     // Extract chunk content
+     const content = text.slice(startPos, endPos);
+
+     chunks.push({
+       content,
+       index: chunks.length,
+       totalChunks: 0, // Will be set after all chunks are created
+       startPos,
+       endPos,
+     });
+
+     // If we've reached the end, stop
+     if (endPos >= text.length) {
+       break;
+     }
+
+     // Move to next chunk - ensure minimum step for progress
+     startPos = startPos + minStep;
+   }
+
+   // Set totalChunks on all chunks
+   const totalChunks = chunks.length;
+   for (const chunk of chunks) {
+     chunk.totalChunks = totalChunks;
+   }
+
+   return chunks;
+ }
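For context on how the new chunker is meant to be called, here is a minimal usage sketch. It is not part of the published diff, and the "./chunker.js" module path is an assumption, since the hunk header does not name the file:

    // Illustrative sketch only; the "./chunker.js" path is assumed, not taken from the diff.
    import { chunkText, DEFAULT_CHUNK_OPTIONS, type ChunkResult } from "./chunker.js";

    const note = "First paragraph of a note.\n\nSecond paragraph with more detail.\n\n".repeat(20);

    // 500-character chunks with 100 characters of overlap (the exported defaults).
    const chunks: ChunkResult[] = chunkText(note, DEFAULT_CHUNK_OPTIONS);

    for (const chunk of chunks) {
      // Each chunk records its position in the original text, so a search hit
      // can later be traced back to the exact span of the source note.
      console.log(`chunk ${chunk.index + 1}/${chunk.totalChunks}: [${chunk.startPos}, ${chunk.endPos})`);
    }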
@@ -0,0 +1,225 @@
+ import { describe, it, expect } from "vitest";
+ import {
+   calculateEntropy,
+   isLikelyBase64,
+   getBase64Ratio,
+   hasBinaryContent,
+   removeBase64Blocks,
+   redactSecrets,
+   filterContent,
+   shouldIndexContent,
+ } from "./content-filter.js";
+
+ describe("content-filter", () => {
+   describe("calculateEntropy", () => {
+     it("returns 0 for empty string", () => {
+       expect(calculateEntropy("")).toBe(0);
+     });
+
+     it("returns low entropy for repetitive text", () => {
+       const entropy = calculateEntropy("aaaaaaaaaa");
+       expect(entropy).toBe(0);
+     });
+
+     it("returns moderate entropy for normal text", () => {
+       const entropy = calculateEntropy("Hello, this is normal text.");
+       expect(entropy).toBeGreaterThan(2);
+       expect(entropy).toBeLessThan(5);
+     });
+
+     it("returns high entropy for Base64 content", () => {
+       const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5";
+       const entropy = calculateEntropy(base64);
+       expect(entropy).toBeGreaterThan(4.5);
+     });
+   });
+
+   describe("isLikelyBase64", () => {
+     it("returns false for short strings", () => {
+       expect(isLikelyBase64("abc123")).toBe(false);
+     });
+
+     it("returns false for normal text", () => {
+       expect(isLikelyBase64("This is normal text with spaces and punctuation!")).toBe(false);
+     });
+
+     it("returns true for Base64 encoded content", () => {
+       const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5eyJpc3MiOiJodHRwczovL2V4YW1wbGUu";
+       expect(isLikelyBase64(base64)).toBe(true);
+     });
+
+     it("returns true for URL-safe Base64", () => {
+       const urlSafe = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5_abc-def123456";
+       expect(isLikelyBase64(urlSafe)).toBe(true);
+     });
+   });
+
+   describe("getBase64Ratio", () => {
+     it("returns 0 for normal text", () => {
+       const ratio = getBase64Ratio("This is completely normal text.");
+       expect(ratio).toBe(0);
+     });
+
+     it("returns high ratio for mostly Base64 content", () => {
+       // Use actual high-entropy Base64, not repeated chars
+       const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5eyJpc3M".repeat(3);
+       const content = "Token: " + base64;
+       const ratio = getBase64Ratio(content);
+       expect(ratio).toBeGreaterThan(0.5);
+     });
+
+     it("returns partial ratio for mixed content", () => {
+       const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5";
+       const content = `Normal text here. ${base64} More normal text.`;
+       const ratio = getBase64Ratio(content);
+       expect(ratio).toBeGreaterThan(0);
+       expect(ratio).toBeLessThan(0.7);
+     });
+   });
+
+   describe("hasBinaryContent", () => {
+     it("returns false for normal text", () => {
+       expect(hasBinaryContent("Normal text")).toBe(false);
+     });
+
+     it("returns false for text with newlines and tabs", () => {
+       expect(hasBinaryContent("Line 1\nLine 2\tTabbed")).toBe(false);
+     });
+
+     it("returns true for null bytes", () => {
+       expect(hasBinaryContent("Text\x00with null")).toBe(true);
+     });
+
+     it("returns true for control characters", () => {
+       expect(hasBinaryContent("Text\x03with control")).toBe(true);
+     });
+   });
+
+   describe("removeBase64Blocks", () => {
+     it("removes Base64 blocks from content", () => {
+       const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5eyJpc3M";
+       const content = `API Token: ${base64}\n\nNext section...`;
+       const result = removeBase64Blocks(content);
+
+       expect(result).not.toContain(base64);
+       expect(result).toContain("[ENCODED]");
+       expect(result).toContain("API Token:");
+       expect(result).toContain("Next section...");
+     });
+
+     it("preserves normal text", () => {
+       const content = "This is completely normal text without any encoding.";
+       expect(removeBase64Blocks(content)).toBe(content);
+     });
+   });
+
+   describe("redactSecrets", () => {
+     it("redacts JWT tokens", () => {
+       const jwt = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.dozjgNryP4J3jVmNHl0w5N_XgL0n3I9PlFUP0THsR8U";
+       const content = `Bearer ${jwt}`;
+       const { content: redacted, secretsFound } = redactSecrets(content);
+
+       expect(redacted).toContain("[JWT_REDACTED]");
+       expect(secretsFound).toContain("jwt");
+     });
+
+     it("redacts AWS access keys", () => {
+       const content = "AWS Key: AKIAIOSFODNN7EXAMPLE";
+       const { content: redacted, secretsFound } = redactSecrets(content);
+
+       expect(redacted).toContain("[AWSACCESSKEY_REDACTED]");
+       expect(secretsFound).toContain("awsAccessKey");
+     });
+
+     it("redacts GitHub tokens", () => {
+       const content = "Token: ghp_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
+       const { content: redacted, secretsFound } = redactSecrets(content);
+
+       expect(redacted).toContain("[GITHUBTOKEN_REDACTED]");
+       expect(secretsFound).toContain("githubToken");
+     });
+
+     it("preserves normal text", () => {
+       const content = "This is normal text without secrets.";
+       const { content: redacted, secretsFound } = redactSecrets(content);
+
+       expect(redacted).toBe(content);
+       expect(secretsFound).toHaveLength(0);
+     });
+   });
+
+   describe("filterContent", () => {
+     it("returns 'index' for clean content", () => {
+       // Content must be at least 50 chars (minContentLength default)
+       const content = "This is clean, normal content for indexing. It contains enough text to pass the minimum length requirement.";
+       const result = filterContent(content);
+
+       expect(result.action).toBe("index");
+       expect(result.cleanedContent).toBe(content);
+       expect(result.reasons).toHaveLength(0);
+     });
+
+     it("returns 'skip' for binary content", () => {
+       const result = filterContent("Text\x00with null bytes");
+
+       expect(result.action).toBe("skip");
+       expect(result.reasons).toContain("Contains binary content");
+     });
+
+     it("returns 'skip' for mostly Base64 content", () => {
+       const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5".repeat(10);
+       const result = filterContent(base64);
+
+       expect(result.action).toBe("skip");
+       expect(result.reasons[0]).toContain("Base64 encoded");
+     });
+
+     it("returns 'filter' for mixed content with Base64", () => {
+       const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5";
+       const content = `This is important text. Token: ${base64}. More important content here that we want to index.`;
+       const result = filterContent(content);
+
+       expect(result.action).toBe("filter");
+       expect(result.cleanedContent).toContain("[ENCODED]");
+       expect(result.cleanedContent).toContain("This is important text");
+       expect(result.reasons.some(r => r.includes("Base64"))).toBe(true);
+     });
+
+     it("returns 'skip' if content too short after filtering", () => {
+       // Short text + Base64 that will be removed, leaving less than 50 chars
+       const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5eyJpc3M";
+       const content = `Hi ${base64}`;
+       const result = filterContent(content);
+
+       expect(result.action).toBe("skip");
+       // After removing Base64, only "Hi [ENCODED]" remains which is too short
+     });
+
+     it("respects custom configuration", () => {
+       const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5eyJpc3M";
+       // Need enough remaining content after potential filtering
+       const content = `This is some text before the token. Token: ${base64}. And this is some text after the token that should remain.`;
+
+       // With removeBase64 disabled, the Base64 should stay
+       const result = filterContent(content, { removeBase64: false });
+
+       expect(result.action).not.toBe("skip");
+       expect(result.cleanedContent).toContain(base64);
+     });
+   });
+
+   describe("shouldIndexContent", () => {
+     it("returns true for normal content", () => {
+       expect(shouldIndexContent("Normal text content")).toBe(true);
+     });
+
+     it("returns false for binary content", () => {
+       expect(shouldIndexContent("Binary\x00content")).toBe(false);
+     });
+
+     it("returns false for mostly Base64", () => {
+       const base64 = "ZXlKMGVYQWlPaUpLVjFRaUxDSmhiR2NpT2lKU1V6STFOaUo5".repeat(10);
+       expect(shouldIndexContent(base64)).toBe(false);
+     });
+   });
+ });
@@ -0,0 +1,275 @@
+ /**
+  * Content quality filter for RAG indexing.
+  * Detects and filters Base64-encoded, binary, and secret content.
+  */
+
+ import { createDebugLogger } from "./debug.js";
+
+ const debug = createDebugLogger("CONTENT_FILTER");
+
+ /**
+  * Result of content filtering.
+  */
+ export interface FilterResult {
+   /** Whether to index this content */
+   action: "index" | "filter" | "skip";
+   /** Cleaned content (if action is "index" or "filter") */
+   cleanedContent?: string;
+   /** Reasons for filtering/skipping */
+   reasons: string[];
+ }
+
+ /**
+  * Calculate Shannon entropy of a string.
+  * Higher entropy = more random/encoded content.
+  *
+  * Typical values:
+  * - Normal text: 0.8 - 4.5
+  * - Base64: 5.0 - 6.0
+  * - Encrypted: 6.0+
+  *
+  * @param str - String to analyze
+  * @returns Entropy value (0-8)
+  */
+ export function calculateEntropy(str: string): number {
+   if (!str || str.length === 0) return 0;
+
+   const freq = new Map<string, number>();
+   for (const char of str) {
+     freq.set(char, (freq.get(char) || 0) + 1);
+   }
+
+   let entropy = 0;
+   const len = str.length;
+   for (const count of freq.values()) {
+     const p = count / len;
+     entropy -= p * Math.log2(p);
+   }
+
+   return entropy;
+ }
+
+ /**
+  * Regex pattern for Base64 content (40+ chars).
+  */
+ const BASE64_PATTERN = /[A-Za-z0-9+/]{40,}={0,2}/g;
+
+ /**
+  * Regex pattern for URL-safe Base64.
+  */
+ const BASE64_URL_SAFE_PATTERN = /[A-Za-z0-9_-]{40,}={0,2}/g;
+
+ /**
+  * Patterns for common secrets/tokens.
+  */
+ const SECRET_PATTERNS: Record<string, RegExp> = {
+   // Private Keys
+   privateKey: /-----BEGIN (?:RSA |DSA |EC |OPENSSH |PGP )?PRIVATE KEY(?: BLOCK)?-----/,
+
+   // JWT tokens
+   jwt: /eyJ[A-Za-z0-9-_=]+\.[A-Za-z0-9-_=]+\.?[A-Za-z0-9-_.+/=]*/g,
+
+   // AWS
+   awsAccessKey: /AKIA[0-9A-Z]{16}/g,
+
+   // GitHub
+   githubToken: /ghp_[a-zA-Z0-9]{36}/g,
+   githubFineGrained: /github_pat_[a-zA-Z0-9]{22}_[a-zA-Z0-9]{59}/g,
+
+   // Slack
+   slackToken: /xox[baprs]-[0-9a-zA-Z]{10,48}/g,
+
+   // Stripe
+   stripeKey: /sk_live_[0-9a-zA-Z]{24}/g,
+
+   // Database URIs with credentials
+   dbUri: /(?:mongodb|postgres(?:ql)?|mysql|redis):\/\/[^\s'"]+:[^\s'"]+@[^\s'"]+/g,
+ };
+
+ /**
+  * Check if a string segment is likely Base64 encoded.
+  */
+ export function isLikelyBase64(str: string): boolean {
+   // Minimum length check
+   if (str.length < 40) return false;
+
+   // Check if only Base64 characters
+   if (!/^[A-Za-z0-9+/=_-]+$/.test(str)) return false;
+
+   // Check entropy - Base64 typically has high entropy
+   const entropy = calculateEntropy(str);
+   return entropy > 4.5;
+ }
+
+ /**
+  * Calculate the ratio of Base64-like content in a string.
+  */
+ export function getBase64Ratio(content: string): number {
+   const matches = content.match(BASE64_PATTERN) || [];
+   const urlSafeMatches = content.match(BASE64_URL_SAFE_PATTERN) || [];
+
+   // Combine and deduplicate
+   const allMatches = new Set([...matches, ...urlSafeMatches]);
+
+   let totalBase64Length = 0;
+   for (const match of allMatches) {
+     if (isLikelyBase64(match)) {
+       totalBase64Length += match.length;
+     }
+   }
+
+   return content.length > 0 ? totalBase64Length / content.length : 0;
+ }
+
+ /**
+  * Check if content contains binary/control characters.
+  */
+ export function hasBinaryContent(content: string): boolean {
+   // Check for null bytes or control characters (except newlines/tabs)
+   return /[\x00-\x08\x0B\x0C\x0E-\x1F]/.test(content);
+ }
+
+ /**
+  * Remove Base64 blocks from content.
+  */
+ export function removeBase64Blocks(content: string): string {
+   let result = content;
+
+   // Remove standard Base64
+   result = result.replace(BASE64_PATTERN, (match) => {
+     if (isLikelyBase64(match)) {
+       return "[ENCODED]";
+     }
+     return match;
+   });
+
+   // Remove URL-safe Base64
+   result = result.replace(BASE64_URL_SAFE_PATTERN, (match) => {
+     if (isLikelyBase64(match)) {
+       return "[ENCODED]";
+     }
+     return match;
+   });
+
+   return result;
+ }
+
+ /**
+  * Redact detected secrets in content.
+  */
+ export function redactSecrets(content: string): { content: string; secretsFound: string[] } {
+   let result = content;
+   const secretsFound: string[] = [];
+
+   for (const [name, pattern] of Object.entries(SECRET_PATTERNS)) {
+     if (pattern.test(result)) {
+       // Reset lastIndex for global patterns
+       pattern.lastIndex = 0;
+       result = result.replace(pattern, `[${name.toUpperCase()}_REDACTED]`);
+       secretsFound.push(name);
+     }
+   }
+
+   return { content: result, secretsFound };
+ }
+
+ /**
+  * Configuration for content filtering.
+  */
+ export interface FilterConfig {
+   /** Maximum Base64 ratio before skipping (default: 0.5) */
+   maxBase64Ratio?: number;
+   /** Minimum meaningful content length after filtering (default: 50) */
+   minContentLength?: number;
+   /** Whether to redact secrets (default: true) */
+   redactSecrets?: boolean;
+   /** Whether to remove Base64 blocks (default: true) */
+   removeBase64?: boolean;
+ }
+
+ const DEFAULT_CONFIG: Required<FilterConfig> = {
+   maxBase64Ratio: 0.5,
+   minContentLength: 50,
+   redactSecrets: true,
+   removeBase64: true,
+ };
+
+ /**
+  * Filter content for RAG indexing.
+  *
+  * @param content - Raw content to filter
+  * @param config - Filter configuration
+  * @returns Filter result with action and cleaned content
+  */
+ export function filterContent(
+   content: string,
+   config: FilterConfig = {}
+ ): FilterResult {
+   const cfg = { ...DEFAULT_CONFIG, ...config };
+   const reasons: string[] = [];
+
+   // 1. Check for binary content - skip entirely
+   if (hasBinaryContent(content)) {
+     debug("Skipping content with binary characters");
+     return { action: "skip", reasons: ["Contains binary content"] };
+   }
+
+   // 2. Calculate Base64 ratio
+   const base64Ratio = getBase64Ratio(content);
+   debug(`Base64 ratio: ${(base64Ratio * 100).toFixed(1)}%`);
+
+   // Skip if too much encoded content
+   if (base64Ratio > cfg.maxBase64Ratio) {
+     debug(`Skipping content: ${(base64Ratio * 100).toFixed(1)}% Base64`);
+     return {
+       action: "skip",
+       reasons: [`${(base64Ratio * 100).toFixed(1)}% is Base64 encoded (threshold: ${(cfg.maxBase64Ratio * 100).toFixed(0)}%)`],
+     };
+   }
+
+   let cleanedContent = content;
+
+   // 3. Remove Base64 blocks if present and configured
+   if (cfg.removeBase64 && base64Ratio > 0.1) {
+     cleanedContent = removeBase64Blocks(cleanedContent);
+     reasons.push("Removed Base64 blocks");
+   }
+
+   // 4. Redact secrets if configured
+   if (cfg.redactSecrets) {
+     const { content: redacted, secretsFound } = redactSecrets(cleanedContent);
+     if (secretsFound.length > 0) {
+       cleanedContent = redacted;
+       reasons.push(`Redacted secrets: ${secretsFound.join(", ")}`);
+     }
+   }
+
+   // 5. Check if remaining content is meaningful
+   const meaningfulContent = cleanedContent
+     .replace(/\[.*?_REDACTED\]|\[ENCODED\]/g, "")
+     .trim();
+
+   if (meaningfulContent.length < cfg.minContentLength) {
+     debug(`Skipping: insufficient content after filtering (${meaningfulContent.length} chars)`);
+     return {
+       action: "skip",
+       reasons: ["Insufficient meaningful content after filtering"],
+     };
+   }
+
+   // Determine action
+   const action = reasons.length > 0 ? "filter" : "index";
+
+   return { action, cleanedContent, reasons };
+ }
+
+ /**
+  * Quick check if content should be indexed.
+  * Use this for fast pre-filtering before chunking.
+  */
+ export function shouldIndexContent(content: string): boolean {
+   // Quick checks
+   if (hasBinaryContent(content)) return false;
+   if (getBase64Ratio(content) > 0.5) return false;
+   return true;
+ }
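Taken together, the two new modules imply a filter-then-chunk flow for RAG indexing. A minimal sketch of that flow follows, assuming the module paths used above and a hypothetical embedAndStore persistence hook; none of this is part of the published diff:

    // Illustrative sketch only; module paths and embedAndStore() are assumptions, not from the diff.
    import { filterContent } from "./content-filter.js";
    import { chunkText } from "./chunker.js";

    // Hypothetical persistence hook, declared here only so the sketch type-checks.
    declare function embedAndStore(text: string, span: { start: number; end: number }): Promise<void>;

    export async function indexNote(body: string): Promise<void> {
      const filtered = filterContent(body);
      if (filtered.action === "skip" || filtered.cleanedContent === undefined) {
        // Binary, mostly Base64, or too little meaningful text: do not index.
        return;
      }
      // "index" passes content through unchanged; "filter" returns content with
      // Base64 blocks replaced by [ENCODED] and secrets replaced by *_REDACTED markers.
      for (const chunk of chunkText(filtered.cleanedContent)) {
        await embedAndStore(chunk.content, { start: chunk.startPos, end: chunk.endPos });
      }
    }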