@twelvehart/supermemory-runtime 1.0.0-next.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +57 -0
- package/README.md +374 -0
- package/dist/index.js +189 -0
- package/dist/mcp/index.js +1132 -0
- package/docker-compose.prod.yml +91 -0
- package/docker-compose.yml +358 -0
- package/drizzle/0000_dapper_the_professor.sql +159 -0
- package/drizzle/0001_api_keys.sql +51 -0
- package/drizzle/meta/0000_snapshot.json +1532 -0
- package/drizzle/meta/_journal.json +13 -0
- package/drizzle.config.ts +20 -0
- package/package.json +114 -0
- package/scripts/add-extraction-job.ts +122 -0
- package/scripts/benchmark-pgvector.ts +122 -0
- package/scripts/bootstrap.sh +209 -0
- package/scripts/check-runtime-pack.ts +111 -0
- package/scripts/claude-mcp-config.ts +336 -0
- package/scripts/docker-entrypoint.sh +183 -0
- package/scripts/doctor.ts +377 -0
- package/scripts/init-db.sql +33 -0
- package/scripts/install.sh +1110 -0
- package/scripts/mcp-setup.ts +271 -0
- package/scripts/migrations/001_create_pgvector_extension.sql +31 -0
- package/scripts/migrations/002_create_memory_embeddings_table.sql +75 -0
- package/scripts/migrations/003_create_hnsw_index.sql +94 -0
- package/scripts/migrations/004_create_memory_embeddings_standalone.sql +70 -0
- package/scripts/migrations/005_create_chunks_table.sql +95 -0
- package/scripts/migrations/006_create_processing_queue.sql +45 -0
- package/scripts/migrations/generate_test_data.sql +42 -0
- package/scripts/migrations/phase1_comprehensive_test.sql +204 -0
- package/scripts/migrations/run_migrations.sh +286 -0
- package/scripts/migrations/test_hnsw_index.sql +255 -0
- package/scripts/pre-commit-secrets +282 -0
- package/scripts/run-extraction-worker.ts +46 -0
- package/scripts/run-phase1-tests.sh +291 -0
- package/scripts/setup.ts +222 -0
- package/scripts/smoke-install.sh +12 -0
- package/scripts/test-health-endpoint.sh +328 -0
- package/src/api/index.ts +2 -0
- package/src/api/middleware/auth.ts +80 -0
- package/src/api/middleware/csrf.ts +308 -0
- package/src/api/middleware/errorHandler.ts +166 -0
- package/src/api/middleware/rateLimit.ts +360 -0
- package/src/api/middleware/validation.ts +514 -0
- package/src/api/routes/documents.ts +286 -0
- package/src/api/routes/profiles.ts +237 -0
- package/src/api/routes/search.ts +71 -0
- package/src/api/stores/index.ts +58 -0
- package/src/config/bootstrap-env.ts +3 -0
- package/src/config/env.ts +71 -0
- package/src/config/feature-flags.ts +25 -0
- package/src/config/index.ts +140 -0
- package/src/config/secrets.config.ts +291 -0
- package/src/db/client.ts +92 -0
- package/src/db/index.ts +73 -0
- package/src/db/postgres.ts +72 -0
- package/src/db/schema/chunks.schema.ts +31 -0
- package/src/db/schema/containers.schema.ts +46 -0
- package/src/db/schema/documents.schema.ts +49 -0
- package/src/db/schema/embeddings.schema.ts +32 -0
- package/src/db/schema/index.ts +11 -0
- package/src/db/schema/memories.schema.ts +72 -0
- package/src/db/schema/profiles.schema.ts +34 -0
- package/src/db/schema/queue.schema.ts +59 -0
- package/src/db/schema/relationships.schema.ts +42 -0
- package/src/db/schema.ts +223 -0
- package/src/db/worker-connection.ts +47 -0
- package/src/index.ts +235 -0
- package/src/mcp/CLAUDE.md +1 -0
- package/src/mcp/index.ts +1380 -0
- package/src/mcp/legacyState.ts +22 -0
- package/src/mcp/rateLimit.ts +358 -0
- package/src/mcp/resources.ts +309 -0
- package/src/mcp/results.ts +104 -0
- package/src/mcp/tools.ts +401 -0
- package/src/queues/config.ts +119 -0
- package/src/queues/index.ts +289 -0
- package/src/sdk/client.ts +225 -0
- package/src/sdk/errors.ts +266 -0
- package/src/sdk/http.ts +560 -0
- package/src/sdk/index.ts +244 -0
- package/src/sdk/resources/base.ts +65 -0
- package/src/sdk/resources/connections.ts +204 -0
- package/src/sdk/resources/documents.ts +163 -0
- package/src/sdk/resources/index.ts +10 -0
- package/src/sdk/resources/memories.ts +150 -0
- package/src/sdk/resources/search.ts +60 -0
- package/src/sdk/resources/settings.ts +36 -0
- package/src/sdk/types.ts +674 -0
- package/src/services/chunking/index.ts +451 -0
- package/src/services/chunking.service.ts +650 -0
- package/src/services/csrf.service.ts +252 -0
- package/src/services/documents.repository.ts +219 -0
- package/src/services/documents.service.ts +191 -0
- package/src/services/embedding.service.ts +404 -0
- package/src/services/extraction.service.ts +300 -0
- package/src/services/extractors/code.extractor.ts +451 -0
- package/src/services/extractors/index.ts +9 -0
- package/src/services/extractors/markdown.extractor.ts +461 -0
- package/src/services/extractors/pdf.extractor.ts +315 -0
- package/src/services/extractors/text.extractor.ts +118 -0
- package/src/services/extractors/url.extractor.ts +243 -0
- package/src/services/index.ts +235 -0
- package/src/services/ingestion.service.ts +177 -0
- package/src/services/llm/anthropic.ts +400 -0
- package/src/services/llm/base.ts +460 -0
- package/src/services/llm/contradiction-detector.service.ts +526 -0
- package/src/services/llm/heuristics.ts +148 -0
- package/src/services/llm/index.ts +309 -0
- package/src/services/llm/memory-classifier.service.ts +383 -0
- package/src/services/llm/memory-extension-detector.service.ts +523 -0
- package/src/services/llm/mock.ts +470 -0
- package/src/services/llm/openai.ts +398 -0
- package/src/services/llm/prompts.ts +438 -0
- package/src/services/llm/types.ts +373 -0
- package/src/services/memory.repository.ts +1769 -0
- package/src/services/memory.service.ts +1338 -0
- package/src/services/memory.types.ts +234 -0
- package/src/services/persistence/index.ts +295 -0
- package/src/services/pipeline.service.ts +509 -0
- package/src/services/profile.repository.ts +436 -0
- package/src/services/profile.service.ts +560 -0
- package/src/services/profile.types.ts +270 -0
- package/src/services/relationships/detector.ts +1128 -0
- package/src/services/relationships/index.ts +268 -0
- package/src/services/relationships/memory-integration.ts +459 -0
- package/src/services/relationships/strategies.ts +132 -0
- package/src/services/relationships/types.ts +370 -0
- package/src/services/search.service.ts +761 -0
- package/src/services/search.types.ts +220 -0
- package/src/services/secrets.service.ts +384 -0
- package/src/services/vectorstore/base.ts +327 -0
- package/src/services/vectorstore/index.ts +444 -0
- package/src/services/vectorstore/memory.ts +286 -0
- package/src/services/vectorstore/migration.ts +295 -0
- package/src/services/vectorstore/mock.ts +403 -0
- package/src/services/vectorstore/pgvector.ts +695 -0
- package/src/services/vectorstore/types.ts +247 -0
- package/src/startup.ts +389 -0
- package/src/types/api.types.ts +193 -0
- package/src/types/document.types.ts +103 -0
- package/src/types/index.ts +241 -0
- package/src/types/profile.base.ts +133 -0
- package/src/utils/errors.ts +447 -0
- package/src/utils/id.ts +15 -0
- package/src/utils/index.ts +101 -0
- package/src/utils/logger.ts +313 -0
- package/src/utils/sanitization.ts +501 -0
- package/src/utils/secret-validation.ts +273 -0
- package/src/utils/synonyms.ts +188 -0
- package/src/utils/validation.ts +581 -0
- package/src/workers/chunking.worker.ts +242 -0
- package/src/workers/embedding.worker.ts +358 -0
- package/src/workers/extraction.worker.ts +346 -0
- package/src/workers/indexing.worker.ts +505 -0
- package/tsconfig.json +38 -0
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
-- Test Script: test_hnsw_index.sql
|
|
2
|
+
-- Description: Comprehensive testing suite for HNSW index performance
|
|
3
|
+
-- Related: TASK-005 from BACKLOG.md
|
|
4
|
+
-- Created: 2026-02-02
|
|
5
|
+
|
|
6
|
+
-- ============================================================================
|
|
7
|
+
-- TEST 1: Verify HNSW Index Creation
|
|
8
|
+
-- ============================================================================
|
|
9
|
+
-- Structural check: abort with an exception when the expected HNSW index
-- is not listed in pg_indexes for the memory_embeddings table.
DO $$
DECLARE
    index_present BOOLEAN;
BEGIN
    SELECT EXISTS (
        SELECT 1
        FROM pg_indexes
        WHERE tablename = 'memory_embeddings'
          AND indexname = 'idx_memory_embeddings_hnsw'
    ) INTO index_present;

    IF NOT index_present THEN
        RAISE EXCEPTION 'HNSW index idx_memory_embeddings_hnsw not found';
    END IF;

    RAISE NOTICE 'TEST 1 PASSED: HNSW index exists';
END $$;
|
|
22
|
+
|
|
23
|
+
-- ============================================================================
|
|
24
|
+
-- TEST 2: Verify Index Uses HNSW Access Method
|
|
25
|
+
-- ============================================================================
|
|
26
|
+
-- Structural check: the index must be built with the "hnsw" access method.
-- Looks the index up in pg_class and resolves its access method via pg_am.
DO $$
DECLARE
    index_method TEXT;
BEGIN
    SELECT am.amname INTO index_method
    FROM pg_class c
    JOIN pg_am am ON c.relam = am.oid
    WHERE c.relname = 'idx_memory_embeddings_hnsw';

    -- IS DISTINCT FROM treats NULL (no such index) as a mismatch. A plain
    -- `!=` comparison would evaluate to NULL, skip this branch, and let the
    -- test report PASSED for a missing index.
    IF index_method IS DISTINCT FROM 'hnsw' THEN
        RAISE EXCEPTION 'Index is not using HNSW access method (found: %)',
            COALESCE(index_method, '<index not found>');
    END IF;

    RAISE NOTICE 'TEST 2 PASSED: Index uses HNSW access method';
END $$;
|
|
41
|
+
|
|
42
|
+
-- ============================================================================
|
|
43
|
+
-- TEST 3: Verify HNSW Parameters (m=16, ef_construction=64)
|
|
44
|
+
-- ============================================================================
|
|
45
|
+
-- Structural check: the index should have been created WITH (m = 16,
-- ef_construction = 64). Parameter mismatches are reported as warnings
-- rather than exceptions; a missing index is an exception.
DO $$
DECLARE
    index_options TEXT;        -- human-readable definition, printed for reference
    index_reloptions TEXT[];   -- raw storage options as "key=value" strings
    params_ok BOOLEAN := true;
BEGIN
    SELECT pg_get_indexdef(s.indexrelid, 0, true), c.reloptions
      INTO index_options, index_reloptions
    FROM pg_stat_user_indexes s
    JOIN pg_class c ON c.oid = s.indexrelid
    WHERE s.indexrelname = 'idx_memory_embeddings_hnsw';

    IF NOT FOUND THEN
        RAISE EXCEPTION 'HNSW index idx_memory_embeddings_hnsw not found';
    END IF;

    -- Compare against pg_class.reloptions instead of pattern-matching the
    -- text from pg_get_indexdef: that text renders option values quoted
    -- (m='16'), so LIKE '%m=16%' would never match.
    IF NOT ('m=16' = ANY (COALESCE(index_reloptions, '{}'::TEXT[]))) THEN
        RAISE WARNING 'Expected m=16 in index options';
        params_ok := false;
    END IF;

    IF NOT ('ef_construction=64' = ANY (COALESCE(index_reloptions, '{}'::TEXT[]))) THEN
        RAISE WARNING 'Expected ef_construction=64 in index options';
        params_ok := false;
    END IF;

    -- Only claim success when both parameters were actually verified.
    IF params_ok THEN
        RAISE NOTICE 'TEST 3 PASSED: HNSW parameters configured (m=16, ef_construction=64)';
    ELSE
        RAISE WARNING 'TEST 3 WARNING: HNSW parameters differ from expected (m=16, ef_construction=64)';
    END IF;

    RAISE NOTICE 'Index definition: %', index_options;
END $$;
|
|
64
|
+
|
|
65
|
+
-- ============================================================================
|
|
66
|
+
-- TEST 4: Verify Query Uses Index Scan
|
|
67
|
+
-- ============================================================================
|
|
68
|
+
-- Create a sample vector for testing
|
|
69
|
+
-- Plans a top-10 nearest-neighbour query with EXPLAIN and inspects the plan
-- text for an index scan on idx_memory_embeddings_hnsw. A plan without it
-- only produces a warning (plus the plan itself), never an exception.
DO $$
DECLARE
    probe vector(1536) := array_fill(0.1, ARRAY[1536])::vector;
    explain_sql TEXT;
    plan_line RECORD;
    plan_text TEXT := '';
BEGIN
    -- The probe vector is inlined as a quoted literal in both places it is used.
    explain_sql := format(
        'EXPLAIN (FORMAT TEXT) SELECT memory_id, 1 - (embedding <=> %L::vector) as similarity FROM memory_embeddings ORDER BY embedding <=> %L::vector LIMIT 10',
        probe::text, probe::text
    );

    -- EXPLAIN returns one row per plan line; stitch them back together.
    FOR plan_line IN EXECUTE explain_sql LOOP
        plan_text := plan_text || plan_line."QUERY PLAN" || E'\n';
    END LOOP;

    IF plan_text NOT LIKE '%Index Scan using idx_memory_embeddings_hnsw%' THEN
        RAISE WARNING 'TEST 4 WARNING: Query may not be using HNSW index';
        RAISE NOTICE 'Explain plan: %', plan_text;
    ELSE
        RAISE NOTICE 'TEST 4 PASSED: Query uses HNSW index scan';
    END IF;
END $$;
|
|
92
|
+
|
|
93
|
+
-- ============================================================================
|
|
94
|
+
-- TEST 5: Performance Benchmark (<100ms for 10K vectors)
|
|
95
|
+
-- ============================================================================
|
|
96
|
+
-- This test requires data in the table
|
|
97
|
+
-- Run after inserting test data
|
|
98
|
+
|
|
99
|
+
-- Times `num_queries` top-10 nearest-neighbour searches against
-- memory_embeddings, each with a freshly generated random 1536-dimension
-- vector, and returns one row per query.
--
-- Parameters:
--   num_queries  number of timed searches to run (default 10)
-- Returns (per query):
--   query_num          1-based query counter
--   execution_time_ms  wall-clock duration of the search in milliseconds
--   results_returned   number of rows the search produced (at most 10)
--   status             'PASS' (< 100 ms), 'WARNING' (< 200 ms), else 'FAIL'
-- Returns no rows (and raises a NOTICE) when the table is empty.
CREATE OR REPLACE FUNCTION run_hnsw_performance_test(
    num_queries INTEGER DEFAULT 10
)
RETURNS TABLE (
    query_num INTEGER,
    execution_time_ms NUMERIC,
    results_returned INTEGER,
    status TEXT
) AS $$
DECLARE
    i INTEGER;
    start_time TIMESTAMPTZ;
    end_time TIMESTAMPTZ;
    exec_time NUMERIC;
    result_count INTEGER;
    sample_vector vector(1536);
    row_count BIGINT;
BEGIN
    -- Check if table has data
    SELECT COUNT(*) INTO row_count FROM memory_embeddings;

    IF row_count = 0 THEN
        RAISE NOTICE 'WARNING: No data in memory_embeddings table. Skipping performance test.';
        RETURN;
    END IF;

    RAISE NOTICE 'Running % test queries on % embeddings...', num_queries, row_count;

    FOR i IN 1..num_queries LOOP
        -- Generate random test vector
        sample_vector := (
            SELECT array_agg(random()::REAL)::vector
            FROM generate_series(1, 1536)
        );

        -- Measure query execution time; clock_timestamp() advances within
        -- a transaction, unlike now().
        start_time := clock_timestamp();

        SELECT COUNT(*) INTO result_count
        FROM (
            SELECT memory_id
            FROM memory_embeddings
            ORDER BY embedding <=> sample_vector
            LIMIT 10
        ) results;

        end_time := clock_timestamp();

        -- Use EPOCH (total seconds) rather than the MILLISECONDS field:
        -- MILLISECONDS only covers the seconds component of the interval,
        -- so a query lasting a minute or more would wrap and look fast.
        exec_time := EXTRACT(EPOCH FROM (end_time - start_time)) * 1000;

        RETURN QUERY SELECT
            i AS query_num,
            exec_time AS execution_time_ms,
            result_count AS results_returned,
            CASE
                WHEN exec_time < 100 THEN 'PASS'
                WHEN exec_time < 200 THEN 'WARNING'
                ELSE 'FAIL'
            END AS status;
    END LOOP;

    RETURN;
END;
$$ LANGUAGE plpgsql;
|
|
162
|
+
|
|
163
|
+
-- ============================================================================
|
|
164
|
+
-- TEST 6: Recall Accuracy Test (~99%)
|
|
165
|
+
-- ============================================================================
|
|
166
|
+
-- This test compares HNSW approximate results with exact results
|
|
167
|
+
|
|
168
|
+
-- Estimates recall of the approximate (index-backed) top-10 search by
-- comparing it with an exact top-10 search for `num_samples` random
-- 1536-dimension query vectors.
--
-- Parameters:
--   num_samples  number of random query vectors to evaluate (default 5)
-- Returns (per sample):
--   sample_num         1-based sample counter
--   recall_percentage  share of the exact top-10 ids also present in the
--                      approximate top-10, as a percentage
--   status             'PASS' (>= 99), 'WARNING' (>= 95), else 'FAIL'
-- Returns no rows (and raises a NOTICE) when the table is empty.
CREATE OR REPLACE FUNCTION test_hnsw_recall_accuracy(
    num_samples INTEGER DEFAULT 5
)
RETURNS TABLE (
    sample_num INTEGER,
    recall_percentage NUMERIC,
    status TEXT
) AS $$
DECLARE
    i INTEGER;
    sample_vector vector(1536);
    exact_ids UUID[];
    approx_ids UUID[];
    matches INTEGER;
    recall NUMERIC;
    row_count BIGINT;
    saved_indexscan TEXT;
BEGIN
    -- Without data every sample would score 0% and be reported as FAIL.
    SELECT COUNT(*) INTO row_count FROM memory_embeddings;

    IF row_count = 0 THEN
        RAISE NOTICE 'WARNING: No data in memory_embeddings table. Skipping recall test.';
        RETURN;
    END IF;

    -- Remember the caller's planner setting so it can be restored after
    -- each exact search.
    saved_indexscan := current_setting('enable_indexscan');

    FOR i IN 1..num_samples LOOP
        -- Generate random test vector
        sample_vector := (
            SELECT array_agg(random()::REAL)::vector
            FROM generate_series(1, 1536)
        );

        -- Get exact results. Index scans are disabled for this statement
        -- only (transaction-local setting); otherwise the planner may use
        -- the HNSW index here too and the comparison below would be
        -- between two identical approximate result sets.
        PERFORM set_config('enable_indexscan', 'off', true);

        SELECT array_agg(memory_id ORDER BY distance) INTO exact_ids
        FROM (
            SELECT memory_id, embedding <=> sample_vector AS distance
            FROM memory_embeddings
            ORDER BY distance
            LIMIT 10
        ) exact;

        PERFORM set_config('enable_indexscan', saved_indexscan, true);

        -- Get approximate results (HNSW index, if the planner chooses it)
        SELECT array_agg(memory_id ORDER BY distance) INTO approx_ids
        FROM (
            SELECT memory_id, embedding <=> sample_vector AS distance
            FROM memory_embeddings
            ORDER BY distance
            LIMIT 10
        ) approx;

        -- Calculate recall (percentage of exact results found in approximate results)
        SELECT COUNT(*) INTO matches
        FROM unnest(exact_ids) exact_id
        WHERE exact_id = ANY(approx_ids);

        recall := (matches::NUMERIC / COALESCE(array_length(exact_ids, 1), 1)) * 100;

        RETURN QUERY SELECT
            i AS sample_num,
            recall AS recall_percentage,
            CASE
                WHEN recall >= 99 THEN 'PASS'
                WHEN recall >= 95 THEN 'WARNING'
                ELSE 'FAIL'
            END AS status;
    END LOOP;

    RETURN;
END;
$$ LANGUAGE plpgsql;
|
|
229
|
+
|
|
230
|
+
-- ============================================================================
|
|
231
|
+
-- Run All Tests
|
|
232
|
+
-- ============================================================================
|
|
233
|
+
-- Emits the suite banner as four separate NOTICE messages.
DO $$
DECLARE
    banner_line TEXT;
BEGIN
    FOREACH banner_line IN ARRAY ARRAY[
        '========================================',
        'HNSW Index Test Suite',
        '========================================',
        'Running structural tests...'
    ]
    LOOP
        RAISE NOTICE '%', banner_line;
    END LOOP;
END $$;
|
|
240
|
+
|
|
241
|
+
-- Tests 1-4 run automatically above
|
|
242
|
+
|
|
243
|
+
-- Note for performance tests:
|
|
244
|
+
\echo ''
|
|
245
|
+
\echo 'To run performance tests (requires data):'
|
|
246
|
+
\echo 'SELECT * FROM run_hnsw_performance_test(10);'
|
|
247
|
+
\echo ''
|
|
248
|
+
\echo 'To test recall accuracy:'
|
|
249
|
+
\echo 'SELECT * FROM test_hnsw_recall_accuracy(5);'
|
|
250
|
+
\echo ''
|
|
251
|
+
\echo 'To check current ef_search setting:'
|
|
252
|
+
\echo 'SHOW hnsw.ef_search;'
|
|
253
|
+
\echo ''
|
|
254
|
+
\echo 'To adjust search quality:'
|
|
255
|
+
\echo "SELECT set_hnsw_search_quality('balanced');"
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Git Pre-Commit Hook: Secrets Detection
|
|
5
|
+
*
|
|
6
|
+
* Scans staged files for potential secrets and blocks commits if found.
|
|
7
|
+
* This prevents accidental secret leakage into version control.
|
|
8
|
+
*
|
|
9
|
+
* Installation:
|
|
10
|
+
* cp scripts/pre-commit-secrets .git/hooks/pre-commit
|
|
11
|
+
* chmod +x .git/hooks/pre-commit
|
|
12
|
+
*
|
|
13
|
+
* Or use Husky:
|
|
14
|
+
* npx husky add .husky/pre-commit "node scripts/pre-commit-secrets"
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import { execSync } from 'child_process';
|
|
18
|
+
import { readFileSync } from 'fs';
|
|
19
|
+
import { resolve } from 'path';
|
|
20
|
+
|
|
21
|
+
// ============================================================================
|
|
22
|
+
// Secret Detection Patterns
|
|
23
|
+
// ============================================================================
|
|
24
|
+
|
|
25
|
+
/**
 * Detection rules applied to every scanned file.
 *
 * Each entry has:
 *   - name:        short label shown next to a finding
 *   - pattern:     regular expression that identifies the secret
 *   - description: one-line explanation shown to the user
 *
 * Every pattern MUST carry the `g` flag: the scanner feeds them to
 * String.prototype.matchAll, which throws a TypeError for non-global
 * regular expressions.
 */
const SECRET_PATTERNS = [
  {
    name: 'Generic API Key',
    pattern: /(?:api[_-]?key|apikey)[=:\s]+['"]?([a-z0-9_-]{20,})/gi,
    description: 'Looks like an API key',
  },
  {
    name: 'Bearer Token',
    pattern: /bearer\s+([a-z0-9_.-]+)/gi,
    description: 'Bearer token detected',
  },
  {
    name: 'AWS Access Key',
    pattern: /AKIA[0-9A-Z]{16}/g,
    description: 'AWS access key detected',
  },
  {
    name: 'AWS Secret Key',
    pattern: /aws[_-]?secret[_-]?access[_-]?key[=:\s]+['"]?([a-z0-9/+=]{40})/gi,
    description: 'AWS secret key detected',
  },
  {
    name: 'Private Key',
    // Global flag added (required by matchAll); also covers the other
    // common PEM private-key headers besides plain and RSA.
    pattern: /-----BEGIN\s+(?:(?:RSA|EC|DSA|OPENSSH|ENCRYPTED)\s+)?PRIVATE\s+KEY-----/gi,
    description: 'Private key detected',
  },
  {
    name: 'Database URL with Credentials',
    pattern: /(?:postgres|mysql|mongodb):\/\/([^:]+):([^@]+)@/gi,
    description: 'Database URL with embedded credentials',
  },
  {
    name: 'Password',
    pattern: /(?:password|passwd|pwd)[=:\s]+['"]?([^\s'"]{8,})/gi,
    description: 'Password detected',
  },
  {
    name: 'JWT Token',
    pattern: /eyJ[a-zA-Z0-9_-]+\.eyJ[a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+/g,
    description: 'JWT token detected',
  },
  {
    name: 'Generic Secret',
    pattern: /secret[_-]?key[=:\s]+['"]?([a-z0-9_.-]{20,})/gi,
    description: 'Secret key detected',
  },
  {
    name: 'Anthropic API Key',
    pattern: /sk-ant-[a-zA-Z0-9-_]{95,}/g,
    description: 'Anthropic API key detected',
  },
  {
    name: 'OpenAI API Key',
    pattern: /sk-[a-zA-Z0-9]{48,}/g,
    description: 'OpenAI API key detected',
  },
  {
    name: 'Stripe API Key',
    pattern: /sk_(live|test)_[a-zA-Z0-9]{24,}/g,
    description: 'Stripe API key detected',
  },
  {
    name: 'Google API Key',
    pattern: /AIza[0-9A-Za-z_-]{35}/g,
    description: 'Google API key detected',
  },
];
|
|
92
|
+
|
|
93
|
+
// ============================================================================
|
|
94
|
+
// Excluded Files/Patterns
|
|
95
|
+
// ============================================================================
|
|
96
|
+
|
|
97
|
+
// Paths matching any of these expressions are skipped by the scanner.
const EXCLUDED_PATTERNS = [
  // Documentation and examples
  /\.md$/i,
  /\.txt$/i,
  /example/i,
  /sample/i,
  /template/i,
  /\.example$/i,

  // Test files (may contain fake secrets)
  /\.test\.(ts|js)$/i,
  /\.spec\.(ts|js)$/i,
  /__tests__\//i,
  /test\//i,
  /tests\//i,

  // Generated files
  /dist\//i,
  /build\//i,
  /node_modules\//i,

  // Lock files
  /package-lock\.json$/i,
  /yarn\.lock$/i,
  /pnpm-lock\.yaml$/i,

  // This file itself
  /pre-commit-secrets$/i,
];

// Files that should ALWAYS be checked (override exclusions)
const FORCE_CHECK_PATTERNS = [
  /\.env$/i,
  /\.env\.[^.]+$/i,
];

// ============================================================================
// Helper Functions
// ============================================================================

/**
 * Decide whether a staged path is skipped by the scanner.
 *
 * A path matching FORCE_CHECK_PATTERNS is never skipped, even when it also
 * matches an exclusion (so `.env.example` is still scanned).
 *
 * @param {string} filename - Path as reported by git.
 * @returns {boolean} true when the file must NOT be scanned.
 */
function shouldExcludeFile(filename) {
  const matchesFilename = (pattern) => pattern.test(filename);
  const isForced = FORCE_CHECK_PATTERNS.some(matchesFilename);

  return !isForced && EXCLUDED_PATTERNS.some(matchesFilename);
}
|
|
149
|
+
|
|
150
|
+
/**
 * List the paths staged for commit (added, copied or modified).
 *
 * Uses `-z` so git emits raw, NUL-terminated paths. Without it, paths with
 * unusual characters (non-ASCII, quotes, embedded newlines) are printed
 * quoted/escaped and would not match any file on disk.
 *
 * @returns {string[]} Staged paths; an empty array when git fails (the
 *   error is logged to stderr).
 */
function getStagedFiles() {
  try {
    const output = execSync('git diff --cached --name-only --diff-filter=ACM -z', {
      encoding: 'utf-8',
    });
    return output.split('\0').filter(Boolean);
  } catch (error) {
    console.error('Error getting staged files:', error.message);
    return [];
  }
}
|
|
164
|
+
|
|
165
|
+
/**
 * Scan one file for every rule in SECRET_PATTERNS.
 *
 * The file is read from the working tree relative to the current working
 * directory. An unreadable file is reported with a warning and yields no
 * findings.
 * NOTE(review): this reads the working-tree copy, which can differ from the
 * staged blob; confirm whether scanning `git show :<path>` is wanted.
 *
 * @param {string} filename - Path relative to process.cwd().
 * @returns {Array<{file: string, line: number, type: string,
 *   description: string, content: string, matched: string}>} One entry per
 *   match; `content` is the trimmed source line (max 80 chars) and
 *   `matched` the matched text (max 40 chars, then "...").
 */
function scanFile(filename) {
  const findings = [];

  // Read file content
  let content;
  try {
    content = readFileSync(resolve(process.cwd(), filename), 'utf-8');
  } catch (error) {
    console.warn(`Warning: Could not read ${filename}: ${error.message}`);
    return findings;
  }

  // Split once up front instead of once per match.
  const lines = content.split('\n');

  // Scan with each pattern
  for (const { name, pattern, description } of SECRET_PATTERNS) {
    // matchAll throws a TypeError for a regex without the `g` flag, so
    // upgrade any non-global rule instead of aborting the whole scan.
    const globalPattern = pattern.global
      ? pattern
      : new RegExp(pattern.source, `${pattern.flags}g`);

    for (const match of content.matchAll(globalPattern)) {
      // 1-based line number = count of newlines before the match, plus one.
      const lineNumber = content.slice(0, match.index).split('\n').length;

      // Line content (truncated)
      const lineContent = lines[lineNumber - 1].trim().slice(0, 80);

      findings.push({
        file: filename,
        line: lineNumber,
        type: name,
        description,
        content: lineContent,
        matched: match[0].slice(0, 40) + (match[0].length > 40 ? '...' : ''),
      });
    }
  }

  return findings;
}
|
|
206
|
+
|
|
207
|
+
/**
 * Print all findings to stdout, grouped by file, followed by remediation
 * advice and the list of affected files as suggested .gitignore entries.
 *
 * @param {Array<{file: string, line: number, type: string,
 *   description: string, content: string}>} findings - Findings to report.
 * @returns {void}
 */
function printFindings(findings) {
  console.log('\n❌ COMMIT BLOCKED: Potential secrets detected!\n');

  // A Map keeps insertion order and is safe for any file name; a plain
  // object accumulator breaks on names such as "__proto__".
  const groupedByFile = new Map();
  for (const finding of findings) {
    if (!groupedByFile.has(finding.file)) {
      groupedByFile.set(finding.file, []);
    }
    groupedByFile.get(finding.file).push(finding);
  }

  for (const [file, fileFindings] of groupedByFile) {
    console.log(`📄 ${file}:`);
    for (const finding of fileFindings) {
      console.log(` Line ${finding.line}: ${finding.type}`);
      console.log(` ${finding.description}`);
      console.log(` "${finding.content}"`);
      console.log('');
    }
  }

  console.log('🔒 Security Recommendations:\n');
  console.log('1. Remove the secret from your code');
  console.log('2. Add the file to .gitignore if it contains secrets');
  console.log('3. Use environment variables instead (see .env.example)');
  console.log('4. If this is a false positive, add the pattern to the exclusion list\n');

  console.log('💡 Suggested .gitignore entries:\n');
  // The Map keys are already the unique file names, in first-seen order.
  for (const file of groupedByFile.keys()) {
    console.log(` ${file}`);
  }
  console.log('');
}
|
|
244
|
+
|
|
245
|
+
// ============================================================================
|
|
246
|
+
// Main
|
|
247
|
+
// ============================================================================
|
|
248
|
+
|
|
249
|
+
/**
 * Hook entry point: scan the staged files and terminate the process.
 *
 * Exit code 0 lets the commit proceed (nothing staged, or nothing found);
 * exit code 1 rejects it after printing the findings.
 *
 * @returns {void} Never returns normally; always calls process.exit.
 */
function main() {
  console.log('🔍 Scanning staged files for secrets...\n');

  const stagedFiles = getStagedFiles();

  if (stagedFiles.length === 0) {
    console.log('✅ No files to scan\n');
    process.exit(0);
  }

  // Filter out excluded files
  const filesToScan = stagedFiles.filter((file) => !shouldExcludeFile(file));

  console.log(`Scanning ${filesToScan.length} of ${stagedFiles.length} staged files...\n`);

  // Scan each file
  const allFindings = [];
  for (const file of filesToScan) {
    const findings = scanFile(file);
    allFindings.push(...findings);
  }

  if (allFindings.length > 0) {
    printFindings(allFindings);
    console.log('❌ Commit rejected due to detected secrets\n');
    process.exit(1);
  }

  console.log('✅ No secrets detected. Proceeding with commit.\n');
  process.exit(0);
}
|
|
280
|
+
|
|
281
|
+
// Run the hook
|
|
282
|
+
main();
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
#!/usr/bin/env tsx
|
|
2
|
+
/**
|
|
3
|
+
* Extraction Worker Runner
|
|
4
|
+
*
|
|
5
|
+
* Starts the extraction worker to process documents from the queue.
|
|
6
|
+
* Run with: npx tsx scripts/run-extraction-worker.ts
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { createExtractionWorker } from '../src/workers/extraction.worker.js';
|
|
10
|
+
import IORedis from 'ioredis';
|
|
11
|
+
import dotenv from 'dotenv';
|
|
12
|
+
|
|
13
|
+
// Load environment variables
dotenv.config();

// Redis connection; host and port come from REDIS_HOST / REDIS_PORT with
// localhost:6379 as the fallback.
const connection = new IORedis({
  host: process.env.REDIS_HOST || 'localhost',
  port: parseInt(process.env.REDIS_PORT || '6379', 10),
  maxRetriesPerRequest: null,
  enableReadyCheck: false,
});

console.log('[ExtractionWorker] Starting worker...');
console.log(`[ExtractionWorker] Redis: ${connection.options.host}:${connection.options.port}`);
console.log(
  `[ExtractionWorker] Concurrency: ${process.env.BULLMQ_CONCURRENCY_EXTRACTION || 5}`
);

// Create and start worker
const worker = createExtractionWorker(connection);

// Handle shutdown. A second signal while shutdown is in progress is ignored
// so the worker and the Redis connection are only closed once, and a failure
// while closing ends the process with a non-zero exit code instead of an
// unhandled promise rejection.
let shuttingDown = false;

const shutdown = async () => {
  if (shuttingDown) {
    return;
  }
  shuttingDown = true;

  console.log('\n[ExtractionWorker] Shutting down gracefully...');

  let exitCode = 0;
  try {
    await worker.close();
    await connection.quit();
    console.log('[ExtractionWorker] Shutdown complete');
  } catch (error) {
    console.error('[ExtractionWorker] Error during shutdown:', error);
    exitCode = 1;
  }

  process.exit(exitCode);
};

process.on('SIGTERM', shutdown);
process.on('SIGINT', shutdown);

console.log('[ExtractionWorker] Worker started successfully. Waiting for jobs...');
console.log('[ExtractionWorker] Press Ctrl+C to stop');
|