rag-lite-ts 2.0.4 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +815 -808
- package/dist/cli/indexer.js +2 -38
- package/dist/cli/search.d.ts +1 -1
- package/dist/cli/search.js +118 -9
- package/dist/cli.js +77 -94
- package/dist/config.js +3 -0
- package/dist/core/database-connection-manager.js +5 -9
- package/dist/core/db.js +173 -173
- package/dist/core/ingestion.js +50 -9
- package/dist/core/lazy-dependency-loader.d.ts +3 -8
- package/dist/core/lazy-dependency-loader.js +11 -29
- package/dist/core/mode-detection-service.js +1 -1
- package/dist/core/reranking-config.d.ts +1 -1
- package/dist/core/reranking-config.js +7 -16
- package/dist/core/reranking-factory.js +3 -184
- package/dist/core/reranking-strategies.js +5 -4
- package/dist/core/search.d.ts +10 -0
- package/dist/core/search.js +34 -11
- package/dist/factories/ingestion-factory.js +3 -1
- package/dist/mcp-server.js +147 -120
- package/dist/multimodal/clip-embedder.js +70 -71
- package/package.json +105 -105
package/dist/core/db.js
CHANGED

```diff
@@ -97,112 +97,112 @@ function enhanceSQLiteError(error, sql) {
 export async function initializeSchema(connection) {
     try {
         // Create documents table with content type support and content_id reference
-        await connection.run(`
-      CREATE TABLE IF NOT EXISTS documents (
-        id INTEGER PRIMARY KEY AUTOINCREMENT,
-        content_id TEXT, -- References content_metadata.id
-        source TEXT NOT NULL UNIQUE,
-        title TEXT NOT NULL,
-        content_type TEXT DEFAULT 'text',
-        metadata TEXT,
-        created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
-        updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
-        FOREIGN KEY (content_id) REFERENCES content_metadata(id)
-      )
+        await connection.run(`
+      CREATE TABLE IF NOT EXISTS documents (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        content_id TEXT, -- References content_metadata.id
+        source TEXT NOT NULL UNIQUE,
+        title TEXT NOT NULL,
+        content_type TEXT DEFAULT 'text',
+        metadata TEXT,
+        created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+        updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+        FOREIGN KEY (content_id) REFERENCES content_metadata(id)
+      )
         `);
         // Create chunks table with content type and metadata support
-        await connection.run(`
-      CREATE TABLE IF NOT EXISTS chunks (
-        id INTEGER PRIMARY KEY AUTOINCREMENT,
-        embedding_id TEXT NOT NULL UNIQUE,
-        document_id INTEGER NOT NULL,
-        content TEXT NOT NULL,
-        content_type TEXT DEFAULT 'text',
-        chunk_index INTEGER NOT NULL,
-        metadata TEXT,
-        created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
-        FOREIGN KEY (document_id) REFERENCES documents(id) ON DELETE CASCADE
-      )
+        await connection.run(`
+      CREATE TABLE IF NOT EXISTS chunks (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        embedding_id TEXT NOT NULL UNIQUE,
+        document_id INTEGER NOT NULL,
+        content TEXT NOT NULL,
+        content_type TEXT DEFAULT 'text',
+        chunk_index INTEGER NOT NULL,
+        metadata TEXT,
+        created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+        FOREIGN KEY (document_id) REFERENCES documents(id) ON DELETE CASCADE
+      )
         `);
         // Create content_metadata table for unified content system
-        await connection.run(`
-      CREATE TABLE IF NOT EXISTS content_metadata (
-        id TEXT PRIMARY KEY, -- Hash-based content ID
-        storage_type TEXT NOT NULL CHECK (storage_type IN ('filesystem', 'content_dir')),
-        original_path TEXT, -- Original file path (filesystem only)
-        content_path TEXT NOT NULL, -- Actual storage path
-        display_name TEXT NOT NULL, -- User-friendly name
-        content_type TEXT NOT NULL, -- MIME type
-        file_size INTEGER NOT NULL, -- Size in bytes
-        content_hash TEXT NOT NULL, -- SHA-256 hash
-        created_at DATETIME DEFAULT CURRENT_TIMESTAMP
-      )
+        await connection.run(`
+      CREATE TABLE IF NOT EXISTS content_metadata (
+        id TEXT PRIMARY KEY, -- Hash-based content ID
+        storage_type TEXT NOT NULL CHECK (storage_type IN ('filesystem', 'content_dir')),
+        original_path TEXT, -- Original file path (filesystem only)
+        content_path TEXT NOT NULL, -- Actual storage path
+        display_name TEXT NOT NULL, -- User-friendly name
+        content_type TEXT NOT NULL, -- MIME type
+        file_size INTEGER NOT NULL, -- Size in bytes
+        content_hash TEXT NOT NULL, -- SHA-256 hash
+        created_at DATETIME DEFAULT CURRENT_TIMESTAMP
+      )
         `);
         // Create storage_stats table for basic content directory tracking
-        await connection.run(`
-      CREATE TABLE IF NOT EXISTS storage_stats (
-        id INTEGER PRIMARY KEY CHECK (id = 1),
-        content_dir_files INTEGER DEFAULT 0,
-        content_dir_size INTEGER DEFAULT 0,
-        filesystem_refs INTEGER DEFAULT 0,
-        last_cleanup DATETIME,
-        updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
-      )
+        await connection.run(`
+      CREATE TABLE IF NOT EXISTS storage_stats (
+        id INTEGER PRIMARY KEY CHECK (id = 1),
+        content_dir_files INTEGER DEFAULT 0,
+        content_dir_size INTEGER DEFAULT 0,
+        filesystem_refs INTEGER DEFAULT 0,
+        last_cleanup DATETIME,
+        updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
+      )
         `);
         // Create system_info table for mode persistence and model tracking
-        await connection.run(`
-      CREATE TABLE IF NOT EXISTS system_info (
-        id INTEGER PRIMARY KEY CHECK (id = 1),
-
-        -- Core mode and model information
-        mode TEXT NOT NULL DEFAULT 'text' CHECK (mode IN ('text', 'multimodal')),
-        model_name TEXT NOT NULL DEFAULT 'sentence-transformers/all-MiniLM-L6-v2',
-        model_type TEXT NOT NULL DEFAULT 'sentence-transformer' CHECK (model_type IN ('sentence-transformer', 'clip')),
-        model_dimensions INTEGER NOT NULL DEFAULT 384,
-        model_version TEXT NOT NULL DEFAULT '',
-
-        -- Content type support (JSON array)
-        supported_content_types TEXT NOT NULL DEFAULT '["text"]',
-
-        -- Reranking configuration
-        reranking_strategy TEXT DEFAULT 'cross-encoder' CHECK (
-          reranking_strategy IN ('cross-encoder', 'text-derived', '
-        ),
-        reranking_model TEXT,
-        reranking_config TEXT, -- JSON configuration for strategy-specific settings
-
-        -- Timestamps
-        created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
-        updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
-      )
+        await connection.run(`
+      CREATE TABLE IF NOT EXISTS system_info (
+        id INTEGER PRIMARY KEY CHECK (id = 1),
+
+        -- Core mode and model information
+        mode TEXT NOT NULL DEFAULT 'text' CHECK (mode IN ('text', 'multimodal')),
+        model_name TEXT NOT NULL DEFAULT 'sentence-transformers/all-MiniLM-L6-v2',
+        model_type TEXT NOT NULL DEFAULT 'sentence-transformer' CHECK (model_type IN ('sentence-transformer', 'clip')),
+        model_dimensions INTEGER NOT NULL DEFAULT 384,
+        model_version TEXT NOT NULL DEFAULT '',
+
+        -- Content type support (JSON array)
+        supported_content_types TEXT NOT NULL DEFAULT '["text"]',
+
+        -- Reranking configuration
+        reranking_strategy TEXT DEFAULT 'cross-encoder' CHECK (
+          reranking_strategy IN ('cross-encoder', 'text-derived', 'disabled')
+        ),
+        reranking_model TEXT,
+        reranking_config TEXT, -- JSON configuration for strategy-specific settings
+
+        -- Timestamps
+        created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+        updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
+      )
         `);
         // Clean slate approach - no migration logic needed
         // Users will perform fresh ingestion with the new architecture
         // Create indexes for performance
-        await connection.run(`
-      CREATE INDEX IF NOT EXISTS idx_chunks_document_id ON chunks(document_id)
+        await connection.run(`
+      CREATE INDEX IF NOT EXISTS idx_chunks_document_id ON chunks(document_id)
         `);
-        await connection.run(`
-      CREATE INDEX IF NOT EXISTS idx_chunks_embedding_id ON chunks(embedding_id)
+        await connection.run(`
+      CREATE INDEX IF NOT EXISTS idx_chunks_embedding_id ON chunks(embedding_id)
         `);
-        await connection.run(`
-      CREATE INDEX IF NOT EXISTS idx_documents_source ON documents(source)
+        await connection.run(`
+      CREATE INDEX IF NOT EXISTS idx_documents_source ON documents(source)
         `);
-        await connection.run(`
-      CREATE INDEX IF NOT EXISTS idx_chunks_content_type ON chunks(content_type)
+        await connection.run(`
+      CREATE INDEX IF NOT EXISTS idx_chunks_content_type ON chunks(content_type)
         `);
-        await connection.run(`
-      CREATE INDEX IF NOT EXISTS idx_documents_content_type ON documents(content_type)
+        await connection.run(`
+      CREATE INDEX IF NOT EXISTS idx_documents_content_type ON documents(content_type)
         `);
-        await connection.run(`
-      CREATE INDEX IF NOT EXISTS idx_documents_content_id ON documents(content_id)
+        await connection.run(`
+      CREATE INDEX IF NOT EXISTS idx_documents_content_id ON documents(content_id)
         `);
         // Create indexes for content metadata table for efficient lookup
-        await connection.run(`
-      CREATE INDEX IF NOT EXISTS idx_content_hash ON content_metadata(content_hash)
+        await connection.run(`
+      CREATE INDEX IF NOT EXISTS idx_content_hash ON content_metadata(content_hash)
         `);
-        await connection.run(`
-      CREATE INDEX IF NOT EXISTS idx_storage_type ON content_metadata(storage_type)
+        await connection.run(`
+      CREATE INDEX IF NOT EXISTS idx_storage_type ON content_metadata(storage_type)
         `);
         console.log('Database schema initialized successfully');
     }
@@ -308,24 +308,24 @@ export async function getChunksByEmbeddingIds(connection, embeddingIds) {
     }
     try {
        const placeholders = embeddingIds.map(() => '?').join(',');
-        const sql = `
-      SELECT
-        c.id,
-        c.embedding_id,
-        c.document_id,
-        c.content,
-        c.content_type,
-        c.chunk_index,
-        c.metadata,
-        c.created_at,
-        d.source as document_source,
-        d.title as document_title,
-        d.content_type as document_content_type,
-        d.content_id as document_content_id
-      FROM chunks c
-      JOIN documents d ON c.document_id = d.id
-      WHERE c.embedding_id IN (${placeholders})
-      ORDER BY c.chunk_index
+        const sql = `
+      SELECT
+        c.id,
+        c.embedding_id,
+        c.document_id,
+        c.content,
+        c.content_type,
+        c.chunk_index,
+        c.metadata,
+        c.created_at,
+        d.source as document_source,
+        d.title as document_title,
+        d.content_type as document_content_type,
+        d.content_id as document_content_id
+      FROM chunks c
+      JOIN documents d ON c.document_id = d.id
+      WHERE c.embedding_id IN (${placeholders})
+      ORDER BY c.chunk_index
         `;
         const results = await connection.all(sql, embeddingIds);
         // Parse metadata JSON strings back to objects
@@ -381,12 +381,12 @@ function validateContentType(contentType) {
  */
 export async function getSystemInfo(connection) {
     try {
-        const result = await connection.get(`
-      SELECT
-        mode, model_name, model_type, model_dimensions, model_version,
-        supported_content_types, reranking_strategy, reranking_model,
-        reranking_config, created_at, updated_at
-      FROM system_info WHERE id = 1
+        const result = await connection.get(`
+      SELECT
+        mode, model_name, model_type, model_dimensions, model_version,
+        supported_content_types, reranking_strategy, reranking_model,
+        reranking_config, created_at, updated_at
+      FROM system_info WHERE id = 1
         `);
         if (!result) {
             return null;
@@ -492,12 +492,12 @@ export async function setSystemInfo(connection, systemInfo) {
     }
     else {
         // Insert new row with provided values and defaults
-        const insertSql = `
-      INSERT INTO system_info (
-        id, mode, model_name, model_type, model_dimensions, model_version,
-        supported_content_types, reranking_strategy, reranking_model, reranking_config,
-        created_at, updated_at
-      ) VALUES (1, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
+        const insertSql = `
+      INSERT INTO system_info (
+        id, mode, model_name, model_type, model_dimensions, model_version,
+        supported_content_types, reranking_strategy, reranking_model, reranking_config,
+        created_at, updated_at
+      ) VALUES (1, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
         `;
         await connection.run(insertSql, [
             systemInfo.mode || 'text',
@@ -556,24 +556,24 @@ export async function getDocumentsByContentType(connection, contentType) {
 export async function getChunksByContentType(connection, contentType) {
     try {
         validateContentType(contentType);
-        const sql = `
-      SELECT
-        c.id,
-        c.embedding_id,
-        c.document_id,
-        c.content,
-        c.content_type,
-        c.chunk_index,
-        c.metadata,
-        c.created_at,
-        d.source as document_source,
-        d.title as document_title,
-        d.content_type as document_content_type,
-        d.content_id as document_content_id
-      FROM chunks c
-      JOIN documents d ON c.document_id = d.id
-      WHERE c.content_type = ?
-      ORDER BY d.source, c.chunk_index
+        const sql = `
+      SELECT
+        c.id,
+        c.embedding_id,
+        c.document_id,
+        c.content,
+        c.content_type,
+        c.chunk_index,
+        c.metadata,
+        c.created_at,
+        d.source as document_source,
+        d.title as document_title,
+        d.content_type as document_content_type,
+        d.content_id as document_content_id
+      FROM chunks c
+      JOIN documents d ON c.document_id = d.id
+      WHERE c.content_type = ?
+      ORDER BY d.source, c.chunk_index
         `;
         const results = await connection.all(sql, [contentType]);
         // Parse metadata JSON strings back to objects
@@ -594,16 +594,16 @@ export async function getChunksByContentType(connection, contentType) {
 export async function getContentTypeStatistics(connection) {
     try {
         // Get document statistics
-        const docStats = await connection.all(`
-      SELECT content_type, COUNT(*) as count
-      FROM documents
-      GROUP BY content_type
+        const docStats = await connection.all(`
+      SELECT content_type, COUNT(*) as count
+      FROM documents
+      GROUP BY content_type
         `);
         // Get chunk statistics
-        const chunkStats = await connection.all(`
-      SELECT content_type, COUNT(*) as count
-      FROM chunks
-      GROUP BY content_type
+        const chunkStats = await connection.all(`
+      SELECT content_type, COUNT(*) as count
+      FROM chunks
+      GROUP BY content_type
         `);
         // Get totals
         const totalDocs = await connection.get('SELECT COUNT(*) as count FROM documents');
@@ -672,11 +672,11 @@ export async function updateChunkMetadata(connection, chunkId, metadata) {
  */
 export async function insertContentMetadata(connection, contentMetadata) {
     try {
-        await connection.run(`
-      INSERT INTO content_metadata (
-        id, storage_type, original_path, content_path, display_name,
-        content_type, file_size, content_hash
-      ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+        await connection.run(`
+      INSERT INTO content_metadata (
+        id, storage_type, original_path, content_path, display_name,
+        content_type, file_size, content_hash
+      ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
         `, [
             contentMetadata.id,
             contentMetadata.storageType,
@@ -703,11 +703,11 @@ export async function insertContentMetadata(connection, contentMetadata) {
  */
 export async function getContentMetadata(connection, contentId) {
     try {
-        const result = await connection.get(`
-      SELECT id, storage_type, original_path, content_path, display_name,
-             content_type, file_size, content_hash, created_at
-      FROM content_metadata
-      WHERE id = ?
+        const result = await connection.get(`
+      SELECT id, storage_type, original_path, content_path, display_name,
+             content_type, file_size, content_hash, created_at
+      FROM content_metadata
+      WHERE id = ?
         `, [contentId]);
         if (!result) {
             return null;
@@ -736,11 +736,11 @@ export async function getContentMetadata(connection, contentId) {
  */
 export async function getContentMetadataByHash(connection, contentHash) {
     try {
-        const result = await connection.get(`
-      SELECT id, storage_type, original_path, content_path, display_name,
-             content_type, file_size, content_hash, created_at
-      FROM content_metadata
-      WHERE content_hash = ?
+        const result = await connection.get(`
+      SELECT id, storage_type, original_path, content_path, display_name,
+             content_type, file_size, content_hash, created_at
+      FROM content_metadata
+      WHERE content_hash = ?
         `, [contentHash]);
         if (!result) {
             return null;
@@ -769,12 +769,12 @@ export async function getContentMetadataByHash(connection, contentHash) {
  */
 export async function getContentMetadataByStorageType(connection, storageType) {
     try {
-        const results = await connection.all(`
-      SELECT id, storage_type, original_path, content_path, display_name,
-             content_type, file_size, content_hash, created_at
-      FROM content_metadata
-      WHERE storage_type = ?
-      ORDER BY created_at DESC
+        const results = await connection.all(`
+      SELECT id, storage_type, original_path, content_path, display_name,
+             content_type, file_size, content_hash, created_at
+      FROM content_metadata
+      WHERE storage_type = ?
+      ORDER BY created_at DESC
         `, [storageType]);
         return results.map((result) => ({
             id: result.id,
@@ -814,11 +814,11 @@ export async function deleteContentMetadata(connection, contentId) {
  */
 export async function getStorageStats(connection) {
     try {
-        const result = await connection.get(`
-      SELECT content_dir_files, content_dir_size, filesystem_refs,
-             last_cleanup, updated_at
-      FROM storage_stats
-      WHERE id = 1
+        const result = await connection.get(`
+      SELECT content_dir_files, content_dir_size, filesystem_refs,
+             last_cleanup, updated_at
+      FROM storage_stats
+      WHERE id = 1
         `);
         if (!result) {
             return null;
@@ -874,11 +874,11 @@ export async function updateStorageStats(connection, stats) {
     }
     else {
         // Insert new row with provided values and defaults
-        const insertSql = `
-      INSERT INTO storage_stats (
-        id, content_dir_files, content_dir_size, filesystem_refs,
-        last_cleanup, updated_at
-      ) VALUES (1, ?, ?, ?, ?, CURRENT_TIMESTAMP)
+        const insertSql = `
+      INSERT INTO storage_stats (
+        id, content_dir_files, content_dir_size, filesystem_refs,
+        last_cleanup, updated_at
+      ) VALUES (1, ?, ?, ?, ?, CURRENT_TIMESTAMP)
         `;
         await connection.run(insertSql, [
             stats.contentDirFiles || 0,
```
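Apart from re-indented SQL template literals, the only substantive change in db.js is the narrowed CHECK constraint on `system_info.reranking_strategy`. A minimal sketch of what the constraint enforces on a freshly initialized 2.1.0 database; the `Connection` shape is assumed from the `connection.run`/`connection.get` calls above, and `'hybrid'` stands in for any removed legacy value:

```ts
// Sketch, not part of the package: effect of the narrowed CHECK constraint.
interface Connection {
  run(sql: string, params?: unknown[]): Promise<void>;
  get(sql: string, params?: unknown[]): Promise<any>;
}

async function demoRerankingStrategyCheck(connection: Connection): Promise<void> {
  // Accepted after 2.1.0: 'cross-encoder', 'text-derived', 'disabled'
  await connection.run(
    `UPDATE system_info SET reranking_strategy = ? WHERE id = 1`,
    ['disabled']
  );
  try {
    // Any other value now violates the CHECK constraint in SQLite
    await connection.run(
      `UPDATE system_info SET reranking_strategy = ? WHERE id = 1`,
      ['hybrid']
    );
  } catch (err) {
    console.error('Rejected by CHECK constraint:', err);
  }
}
```

Note that `CREATE TABLE IF NOT EXISTS` does not alter an existing table, which fits the "clean slate" comment in the diff: the narrowed constraint only applies to databases created by fresh ingestion.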
package/dist/core/ingestion.js
CHANGED

```diff
@@ -9,6 +9,9 @@ import { config } from './config.js';
 import { DocumentPathManager } from './path-manager.js';
 import { existsSync } from 'fs';
 import { ContentManager } from './content-manager.js';
+import { createRequire } from 'module';
+// Create require for CommonJS modules in ES module context
+const require = createRequire(import.meta.url);
 /**
  * Main ingestion pipeline class
  * Coordinates the entire process from file discovery to vector storage
```
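The added import block is the standard Node.js bridge for loading CommonJS modules from ES module code; nothing about it is specific to rag-lite. A self-contained sketch of the pattern (the required package name is illustrative, not from the diff):

```ts
import { createRequire } from 'module';

// ES modules have no ambient `require`; createRequire builds one that
// resolves module paths relative to this file's URL.
const require = createRequire(import.meta.url);

// Illustrative only: load a hypothetical CommonJS-only dependency.
// const legacyLib = require('some-commonjs-package');
```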
```diff
@@ -198,7 +201,23 @@
         try {
             // Convert MIME type to simple content type for embedding function
             const contentTypeForEmbedding = this.getContentTypeForEmbedding(document.metadata?.contentType);
-            const embedding = await this.embedFn(chunk.text, contentTypeForEmbedding);
+            // For images, use the image path from metadata instead of text description
+            let contentForEmbedding = chunk.text;
+            if (contentTypeForEmbedding === 'image' && document.metadata) {
+                // Try to get image path from metadata (contentPath, originalPath, or source)
+                // contentPath is where the image is stored (from contentResult)
+                const imagePath = document.metadata.contentPath ||
+                    document.metadata.originalPath ||
+                    document.metadata.source;
+                if (imagePath) {
+                    contentForEmbedding = imagePath;
+                }
+                else {
+                    // Fallback: try to extract path from source if available
+                    console.warn(`Image chunk ${i + 1} missing image path in metadata, using text content as fallback`);
+                }
+            }
+            const embedding = await this.embedFn(contentForEmbedding, contentTypeForEmbedding);
             // Enhance embedding result with content type metadata
             if (!embedding.contentType) {
                 embedding.contentType = contentTypeForEmbedding;
@@ -444,7 +463,20 @@
         try {
             // Convert MIME type to simple content type for embedding function
             const contentTypeForEmbedding = this.getContentTypeForEmbedding(chunk.contentType);
-            const embedding = await this.embedFn(chunk.text, contentTypeForEmbedding);
+            // For images, use the image path from metadata instead of text description
+            let contentForEmbedding = chunk.text;
+            if (contentTypeForEmbedding === 'image' && chunk.metadata) {
+                // Try to get image path from metadata (originalPath or contentPath)
+                const imagePath = chunk.metadata.originalPath || chunk.metadata.contentPath || chunk.metadata.source;
+                if (imagePath) {
+                    contentForEmbedding = imagePath;
+                }
+                else {
+                    // Fallback: try to extract path from source if available
+                    console.warn(`Image chunk ${i + 1} missing image path in metadata, using text content as fallback`);
+                }
+            }
+            const embedding = await this.embedFn(contentForEmbedding, contentTypeForEmbedding);
             // Enhance embedding result with content type metadata if not already present
             if (!embedding.contentType) {
                 embedding.contentType = contentTypeForEmbedding;
```
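The two hunks above apply the same rule at two call sites: when a chunk is an image, hand the embedder a path to the image rather than its text description, falling back to the text when no path was recorded. Note the call sites probe the metadata fields in different orders (`contentPath` first in the first hunk, `originalPath` first in the second). A standalone sketch of the rule; the helper name and metadata type are illustrative, not part of the package:

```ts
// Hypothetical helper mirroring the logic duplicated in the two hunks above.
interface ImagePathMetadata {
  contentPath?: string;   // stored copy of the image (from contentResult)
  originalPath?: string;  // original file location
  source?: string;        // document source identifier
}

function selectContentForEmbedding(
  chunkText: string,
  contentType: string,
  metadata?: ImagePathMetadata
): string {
  if (contentType !== 'image' || !metadata) {
    return chunkText; // non-image chunks embed their text, as before
  }
  const imagePath =
    metadata.contentPath ?? metadata.originalPath ?? metadata.source;
  if (!imagePath) {
    // Mirrors the diff's fallback: warn and embed the text description
    console.warn('Image chunk missing image path in metadata, using text content as fallback');
    return chunkText;
  }
  return imagePath; // the CLIP-style embedder receives the image path
}
```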
```diff
@@ -585,21 +617,28 @@
      * @param mimeType - MIME type string (e.g., 'text/plain', 'image/jpeg')
      * @returns Simple content type ('text', 'image', etc.)
      */
-    getContentTypeForEmbedding(mimeType) {
-        if (!mimeType) {
+    getContentTypeForEmbedding(contentType) {
+        if (!contentType) {
+            return 'text';
+        }
+        // Handle simple content type strings (used by chunking)
+        if (contentType === 'image') {
+            return 'image';
+        }
+        else if (contentType === 'text') {
             return 'text';
         }
-        // Convert MIME types to simple content types
-        if (mimeType.startsWith('text/')) {
+        // Convert MIME types to simple content types (legacy support)
+        if (contentType.startsWith('text/')) {
             return 'text';
         }
-        else if (mimeType.startsWith('image/')) {
+        else if (contentType.startsWith('image/')) {
             return 'image';
         }
-        else if (mimeType === 'application/pdf') {
+        else if (contentType === 'application/pdf') {
             return 'text'; // PDFs are processed as text
         }
-        else if (mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
+        else if (contentType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
             return 'text'; // DOCX files are processed as text
         }
         else {
```
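`getContentTypeForEmbedding` now accepts the simple `'text'`/`'image'` strings produced by chunking as well as legacy MIME types. A standalone mirror of the mapping for illustration; the function name is hypothetical, and since the trailing `else` branch falls outside the hunk, its default here is an assumption:

```ts
// Illustrative re-statement of the mapping; not the package's own export.
function mapContentTypeForEmbedding(contentType?: string): 'text' | 'image' {
  if (!contentType) return 'text';
  if (contentType === 'image') return 'image'; // simple string from chunking
  if (contentType === 'text') return 'text';   // simple string from chunking
  if (contentType.startsWith('text/')) return 'text';   // legacy MIME
  if (contentType.startsWith('image/')) return 'image'; // legacy MIME
  if (contentType === 'application/pdf') return 'text'; // PDFs processed as text
  if (contentType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
    return 'text'; // DOCX processed as text
  }
  return 'text'; // assumed default; the final else branch is outside the hunk
}

console.log(mapContentTypeForEmbedding('image/png')); // 'image'
console.log(mapContentTypeForEmbedding(undefined));   // 'text'
```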
```diff
@@ -668,6 +707,7 @@
                     contentType: 'image',
                     contentId: contentResult.contentId,
                     storageType: contentResult.storageType,
+                    contentPath: contentResult.contentPath, // Store contentPath for embedding
                     originalPath: metadata.originalPath,
                     ...imageMetadata // Spread all image metadata fields
                 }
@@ -684,6 +724,7 @@
                     contentType: 'image',
                     contentId: contentResult.contentId,
                     storageType: contentResult.storageType,
+                    contentPath: contentResult.contentPath, // Store contentPath for embedding
                     originalPath: metadata.originalPath,
                     processingError: error instanceof Error ? error.message : String(error)
                 }
```
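Both the success and error paths now record `contentPath` in the chunk metadata, which is exactly what the image-path lookup in the embedding hunks reads first. An illustrative sketch of the resulting metadata shape, assembled from the fields visible above; the interface name is hypothetical:

```ts
// Hypothetical shape; field list taken from the two metadata blocks above.
interface ImageChunkMetadata {
  contentType: 'image';
  contentId: string;         // contentResult.contentId
  storageType: string;       // contentResult.storageType
  contentPath: string;       // new in 2.1.0: stored path, used for embedding
  originalPath?: string;     // metadata.originalPath
  processingError?: string;  // only set on the failure path
  [extra: string]: unknown;  // ...imageMetadata spread on the success path
}
```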
package/dist/core/lazy-dependency-loader.d.ts
CHANGED

```diff
@@ -59,15 +59,10 @@ export declare class LazyRerankerLoader {
      */
     static loadTextDerivedReranker(): Promise<RerankFunction>;
     /**
-     * Lazily load
-     *
+     * Lazily load CLIP AutoProcessor for consistent image preprocessing
+     * Shares processor instances across embedder instances to ensure identical preprocessing
      */
-    static loadMetadataReranker(): Promise<RerankFunction>;
-    /**
-     * Lazily load hybrid reranker for multimodal mode
-     * Combines multiple reranking strategies (uses text-derived for now)
-     */
-    static loadHybridReranker(): Promise<RerankFunction>;
+    static loadCLIPAutoProcessor(modelName: string): Promise<any>;
     /**
      * Check if a reranker is already loaded in cache
      */
```
package/dist/core/lazy-dependency-loader.js
CHANGED

```diff
@@ -198,32 +198,18 @@ export class LazyRerankerLoader {
         });
     }
     /**
-     * Lazily load
-     *
+     * Lazily load CLIP AutoProcessor for consistent image preprocessing
+     * Shares processor instances across embedder instances to ensure identical preprocessing
      */
-    static async loadMetadataReranker() {
-        const cacheKey =
+    static async loadCLIPAutoProcessor(modelName) {
+        const cacheKey = `processor:clip:${modelName}`;
         return this.cache.getOrLoad(cacheKey, async () => {
-            console.log(
-            // Dynamic import - only loaded when
-            const {
-            const
-            console.log(
-            return
-        });
-    }
-    /**
-     * Lazily load hybrid reranker for multimodal mode
-     * Combines multiple reranking strategies (uses text-derived for now)
-     */
-    static async loadHybridReranker() {
-        const cacheKey = 'reranker:hybrid';
-        return this.cache.getOrLoad(cacheKey, async () => {
-            console.log('🔄 Lazy loading hybrid reranker (multimodal)');
-            // For now, hybrid reranking uses text-derived
-            // TODO: Implement proper hybrid reranking in future tasks
-            console.log('🔄 Hybrid reranking not yet implemented, using text-derived');
-            return this.loadTextDerivedReranker();
+            console.log(`🔄 Lazy loading CLIP AutoProcessor: ${modelName}`);
+            // Dynamic import - only loaded when CLIP models are used
+            const { AutoProcessor } = await import('@huggingface/transformers');
+            const processor = await AutoProcessor.from_pretrained(modelName);
+            console.log(`✅ CLIP AutoProcessor loaded: ${modelName}`);
+            return processor;
         });
     }
     /**
@@ -371,12 +357,8 @@ export class LazyDependencyManager {
                 return LazyRerankerLoader.loadTextReranker();
             case 'text-derived':
                 return LazyRerankerLoader.loadTextDerivedReranker();
-            case 'metadata':
-                return LazyRerankerLoader.loadMetadataReranker();
-            case 'hybrid':
-                return LazyRerankerLoader.loadHybridReranker();
             default:
-                throw new Error(`Unknown reranking strategy '${strategy}'. Supported strategies: cross-encoder, text-derived,
+                throw new Error(`Unknown reranking strategy '${strategy}'. Supported strategies: cross-encoder, text-derived, disabled`);
         }
     }
     /**
```
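The metadata and hybrid rerankers are gone; in their place, `LazyRerankerLoader` caches one CLIP `AutoProcessor` per model name so every embedder instance preprocesses images identically. A usage sketch, assuming `@huggingface/transformers` is installed and a cache that returns the same instance for the same key; the checkpoint name is illustrative:

```ts
// Sketch: repeated loads with the same model name hit the cache entry
// keyed `processor:clip:${modelName}`, so from_pretrained runs only once.
const modelName = 'Xenova/clip-vit-base-patch32'; // illustrative checkpoint

const first = await LazyRerankerLoader.loadCLIPAutoProcessor(modelName);
const second = await LazyRerankerLoader.loadCLIPAutoProcessor(modelName);

console.log(first === second); // true: both resolve to the cached processor
```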
package/dist/core/mode-detection-service.js
CHANGED

```diff
@@ -526,7 +526,7 @@ export class ModeDetectionService {
      * @private
      */
     validateRerankingStrategy(strategy) {
-        const validStrategies = ['cross-encoder', 'text-derived', '
+        const validStrategies = ['cross-encoder', 'text-derived', 'disabled'];
         if (!validStrategies.includes(strategy)) {
             throw createError.validation(`Invalid reranking strategy '${strategy}'. Must be one of: ${validStrategies.join(', ')}`);
         }
```
package/dist/core/reranking-config.d.ts
CHANGED

```diff
@@ -4,7 +4,7 @@
  * Provides straightforward configuration types and validation for different
  * reranking strategies without complex interface patterns.
  */
-export type RerankingStrategyType = 'cross-encoder' | 'text-derived' | '
+export type RerankingStrategyType = 'cross-encoder' | 'text-derived' | 'disabled';
 export interface RerankingConfig {
     strategy: RerankingStrategyType;
     model?: string;
```