rag-lite-ts 2.0.5 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/core/db.js CHANGED
@@ -97,112 +97,112 @@ function enhanceSQLiteError(error, sql) {
97
97
  export async function initializeSchema(connection) {
98
98
  try {
99
99
  // Create documents table with content type support and content_id reference
100
- await connection.run(`
101
- CREATE TABLE IF NOT EXISTS documents (
102
- id INTEGER PRIMARY KEY AUTOINCREMENT,
103
- content_id TEXT, -- References content_metadata.id
104
- source TEXT NOT NULL UNIQUE,
105
- title TEXT NOT NULL,
106
- content_type TEXT DEFAULT 'text',
107
- metadata TEXT,
108
- created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
109
- updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
110
- FOREIGN KEY (content_id) REFERENCES content_metadata(id)
111
- )
100
+ await connection.run(`
101
+ CREATE TABLE IF NOT EXISTS documents (
102
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
103
+ content_id TEXT, -- References content_metadata.id
104
+ source TEXT NOT NULL UNIQUE,
105
+ title TEXT NOT NULL,
106
+ content_type TEXT DEFAULT 'text',
107
+ metadata TEXT,
108
+ created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
109
+ updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
110
+ FOREIGN KEY (content_id) REFERENCES content_metadata(id)
111
+ )
112
112
  `);
113
113
  // Create chunks table with content type and metadata support
114
- await connection.run(`
115
- CREATE TABLE IF NOT EXISTS chunks (
116
- id INTEGER PRIMARY KEY AUTOINCREMENT,
117
- embedding_id TEXT NOT NULL UNIQUE,
118
- document_id INTEGER NOT NULL,
119
- content TEXT NOT NULL,
120
- content_type TEXT DEFAULT 'text',
121
- chunk_index INTEGER NOT NULL,
122
- metadata TEXT,
123
- created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
124
- FOREIGN KEY (document_id) REFERENCES documents(id) ON DELETE CASCADE
125
- )
114
+ await connection.run(`
115
+ CREATE TABLE IF NOT EXISTS chunks (
116
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
117
+ embedding_id TEXT NOT NULL UNIQUE,
118
+ document_id INTEGER NOT NULL,
119
+ content TEXT NOT NULL,
120
+ content_type TEXT DEFAULT 'text',
121
+ chunk_index INTEGER NOT NULL,
122
+ metadata TEXT,
123
+ created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
124
+ FOREIGN KEY (document_id) REFERENCES documents(id) ON DELETE CASCADE
125
+ )
126
126
  `);
127
127
  // Create content_metadata table for unified content system
128
- await connection.run(`
129
- CREATE TABLE IF NOT EXISTS content_metadata (
130
- id TEXT PRIMARY KEY, -- Hash-based content ID
131
- storage_type TEXT NOT NULL CHECK (storage_type IN ('filesystem', 'content_dir')),
132
- original_path TEXT, -- Original file path (filesystem only)
133
- content_path TEXT NOT NULL, -- Actual storage path
134
- display_name TEXT NOT NULL, -- User-friendly name
135
- content_type TEXT NOT NULL, -- MIME type
136
- file_size INTEGER NOT NULL, -- Size in bytes
137
- content_hash TEXT NOT NULL, -- SHA-256 hash
138
- created_at DATETIME DEFAULT CURRENT_TIMESTAMP
139
- )
128
+ await connection.run(`
129
+ CREATE TABLE IF NOT EXISTS content_metadata (
130
+ id TEXT PRIMARY KEY, -- Hash-based content ID
131
+ storage_type TEXT NOT NULL CHECK (storage_type IN ('filesystem', 'content_dir')),
132
+ original_path TEXT, -- Original file path (filesystem only)
133
+ content_path TEXT NOT NULL, -- Actual storage path
134
+ display_name TEXT NOT NULL, -- User-friendly name
135
+ content_type TEXT NOT NULL, -- MIME type
136
+ file_size INTEGER NOT NULL, -- Size in bytes
137
+ content_hash TEXT NOT NULL, -- SHA-256 hash
138
+ created_at DATETIME DEFAULT CURRENT_TIMESTAMP
139
+ )
140
140
  `);
141
141
  // Create storage_stats table for basic content directory tracking
142
- await connection.run(`
143
- CREATE TABLE IF NOT EXISTS storage_stats (
144
- id INTEGER PRIMARY KEY CHECK (id = 1),
145
- content_dir_files INTEGER DEFAULT 0,
146
- content_dir_size INTEGER DEFAULT 0,
147
- filesystem_refs INTEGER DEFAULT 0,
148
- last_cleanup DATETIME,
149
- updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
150
- )
142
+ await connection.run(`
143
+ CREATE TABLE IF NOT EXISTS storage_stats (
144
+ id INTEGER PRIMARY KEY CHECK (id = 1),
145
+ content_dir_files INTEGER DEFAULT 0,
146
+ content_dir_size INTEGER DEFAULT 0,
147
+ filesystem_refs INTEGER DEFAULT 0,
148
+ last_cleanup DATETIME,
149
+ updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
150
+ )
151
151
  `);
152
152
  // Create system_info table for mode persistence and model tracking
153
- await connection.run(`
154
- CREATE TABLE IF NOT EXISTS system_info (
155
- id INTEGER PRIMARY KEY CHECK (id = 1),
156
-
157
- -- Core mode and model information
158
- mode TEXT NOT NULL DEFAULT 'text' CHECK (mode IN ('text', 'multimodal')),
159
- model_name TEXT NOT NULL DEFAULT 'sentence-transformers/all-MiniLM-L6-v2',
160
- model_type TEXT NOT NULL DEFAULT 'sentence-transformer' CHECK (model_type IN ('sentence-transformer', 'clip')),
161
- model_dimensions INTEGER NOT NULL DEFAULT 384,
162
- model_version TEXT NOT NULL DEFAULT '',
163
-
164
- -- Content type support (JSON array)
165
- supported_content_types TEXT NOT NULL DEFAULT '["text"]',
166
-
167
- -- Reranking configuration
168
- reranking_strategy TEXT DEFAULT 'cross-encoder' CHECK (
169
- reranking_strategy IN ('cross-encoder', 'text-derived', 'metadata', 'hybrid', 'disabled')
170
- ),
171
- reranking_model TEXT,
172
- reranking_config TEXT, -- JSON configuration for strategy-specific settings
173
-
174
- -- Timestamps
175
- created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
176
- updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
177
- )
153
+ await connection.run(`
154
+ CREATE TABLE IF NOT EXISTS system_info (
155
+ id INTEGER PRIMARY KEY CHECK (id = 1),
156
+
157
+ -- Core mode and model information
158
+ mode TEXT NOT NULL DEFAULT 'text' CHECK (mode IN ('text', 'multimodal')),
159
+ model_name TEXT NOT NULL DEFAULT 'sentence-transformers/all-MiniLM-L6-v2',
160
+ model_type TEXT NOT NULL DEFAULT 'sentence-transformer' CHECK (model_type IN ('sentence-transformer', 'clip')),
161
+ model_dimensions INTEGER NOT NULL DEFAULT 384,
162
+ model_version TEXT NOT NULL DEFAULT '',
163
+
164
+ -- Content type support (JSON array)
165
+ supported_content_types TEXT NOT NULL DEFAULT '["text"]',
166
+
167
+ -- Reranking configuration
168
+ reranking_strategy TEXT DEFAULT 'cross-encoder' CHECK (
169
+ reranking_strategy IN ('cross-encoder', 'text-derived', 'disabled')
170
+ ),
171
+ reranking_model TEXT,
172
+ reranking_config TEXT, -- JSON configuration for strategy-specific settings
173
+
174
+ -- Timestamps
175
+ created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
176
+ updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
177
+ )
178
178
  `);
179
179
  // Clean slate approach - no migration logic needed
180
180
  // Users will perform fresh ingestion with the new architecture
181
181
  // Create indexes for performance
182
- await connection.run(`
183
- CREATE INDEX IF NOT EXISTS idx_chunks_document_id ON chunks(document_id)
182
+ await connection.run(`
183
+ CREATE INDEX IF NOT EXISTS idx_chunks_document_id ON chunks(document_id)
184
184
  `);
185
- await connection.run(`
186
- CREATE INDEX IF NOT EXISTS idx_chunks_embedding_id ON chunks(embedding_id)
185
+ await connection.run(`
186
+ CREATE INDEX IF NOT EXISTS idx_chunks_embedding_id ON chunks(embedding_id)
187
187
  `);
188
- await connection.run(`
189
- CREATE INDEX IF NOT EXISTS idx_documents_source ON documents(source)
188
+ await connection.run(`
189
+ CREATE INDEX IF NOT EXISTS idx_documents_source ON documents(source)
190
190
  `);
191
- await connection.run(`
192
- CREATE INDEX IF NOT EXISTS idx_chunks_content_type ON chunks(content_type)
191
+ await connection.run(`
192
+ CREATE INDEX IF NOT EXISTS idx_chunks_content_type ON chunks(content_type)
193
193
  `);
194
- await connection.run(`
195
- CREATE INDEX IF NOT EXISTS idx_documents_content_type ON documents(content_type)
194
+ await connection.run(`
195
+ CREATE INDEX IF NOT EXISTS idx_documents_content_type ON documents(content_type)
196
196
  `);
197
- await connection.run(`
198
- CREATE INDEX IF NOT EXISTS idx_documents_content_id ON documents(content_id)
197
+ await connection.run(`
198
+ CREATE INDEX IF NOT EXISTS idx_documents_content_id ON documents(content_id)
199
199
  `);
200
200
  // Create indexes for content metadata table for efficient lookup
201
- await connection.run(`
202
- CREATE INDEX IF NOT EXISTS idx_content_hash ON content_metadata(content_hash)
201
+ await connection.run(`
202
+ CREATE INDEX IF NOT EXISTS idx_content_hash ON content_metadata(content_hash)
203
203
  `);
204
- await connection.run(`
205
- CREATE INDEX IF NOT EXISTS idx_storage_type ON content_metadata(storage_type)
204
+ await connection.run(`
205
+ CREATE INDEX IF NOT EXISTS idx_storage_type ON content_metadata(storage_type)
206
206
  `);
207
207
  console.log('Database schema initialized successfully');
208
208
  }
@@ -308,24 +308,24 @@ export async function getChunksByEmbeddingIds(connection, embeddingIds) {
308
308
  }
309
309
  try {
310
310
  const placeholders = embeddingIds.map(() => '?').join(',');
311
- const sql = `
312
- SELECT
313
- c.id,
314
- c.embedding_id,
315
- c.document_id,
316
- c.content,
317
- c.content_type,
318
- c.chunk_index,
319
- c.metadata,
320
- c.created_at,
321
- d.source as document_source,
322
- d.title as document_title,
323
- d.content_type as document_content_type,
324
- d.content_id as document_content_id
325
- FROM chunks c
326
- JOIN documents d ON c.document_id = d.id
327
- WHERE c.embedding_id IN (${placeholders})
328
- ORDER BY c.chunk_index
311
+ const sql = `
312
+ SELECT
313
+ c.id,
314
+ c.embedding_id,
315
+ c.document_id,
316
+ c.content,
317
+ c.content_type,
318
+ c.chunk_index,
319
+ c.metadata,
320
+ c.created_at,
321
+ d.source as document_source,
322
+ d.title as document_title,
323
+ d.content_type as document_content_type,
324
+ d.content_id as document_content_id
325
+ FROM chunks c
326
+ JOIN documents d ON c.document_id = d.id
327
+ WHERE c.embedding_id IN (${placeholders})
328
+ ORDER BY c.chunk_index
329
329
  `;
330
330
  const results = await connection.all(sql, embeddingIds);
331
331
  // Parse metadata JSON strings back to objects
@@ -381,12 +381,12 @@ function validateContentType(contentType) {
381
381
  */
382
382
  export async function getSystemInfo(connection) {
383
383
  try {
384
- const result = await connection.get(`
385
- SELECT
386
- mode, model_name, model_type, model_dimensions, model_version,
387
- supported_content_types, reranking_strategy, reranking_model,
388
- reranking_config, created_at, updated_at
389
- FROM system_info WHERE id = 1
384
+ const result = await connection.get(`
385
+ SELECT
386
+ mode, model_name, model_type, model_dimensions, model_version,
387
+ supported_content_types, reranking_strategy, reranking_model,
388
+ reranking_config, created_at, updated_at
389
+ FROM system_info WHERE id = 1
390
390
  `);
391
391
  if (!result) {
392
392
  return null;
@@ -492,12 +492,12 @@ export async function setSystemInfo(connection, systemInfo) {
492
492
  }
493
493
  else {
494
494
  // Insert new row with provided values and defaults
495
- const insertSql = `
496
- INSERT INTO system_info (
497
- id, mode, model_name, model_type, model_dimensions, model_version,
498
- supported_content_types, reranking_strategy, reranking_model, reranking_config,
499
- created_at, updated_at
500
- ) VALUES (1, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
495
+ const insertSql = `
496
+ INSERT INTO system_info (
497
+ id, mode, model_name, model_type, model_dimensions, model_version,
498
+ supported_content_types, reranking_strategy, reranking_model, reranking_config,
499
+ created_at, updated_at
500
+ ) VALUES (1, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
501
501
  `;
502
502
  await connection.run(insertSql, [
503
503
  systemInfo.mode || 'text',
@@ -556,24 +556,24 @@ export async function getDocumentsByContentType(connection, contentType) {
556
556
  export async function getChunksByContentType(connection, contentType) {
557
557
  try {
558
558
  validateContentType(contentType);
559
- const sql = `
560
- SELECT
561
- c.id,
562
- c.embedding_id,
563
- c.document_id,
564
- c.content,
565
- c.content_type,
566
- c.chunk_index,
567
- c.metadata,
568
- c.created_at,
569
- d.source as document_source,
570
- d.title as document_title,
571
- d.content_type as document_content_type,
572
- d.content_id as document_content_id
573
- FROM chunks c
574
- JOIN documents d ON c.document_id = d.id
575
- WHERE c.content_type = ?
576
- ORDER BY d.source, c.chunk_index
559
+ const sql = `
560
+ SELECT
561
+ c.id,
562
+ c.embedding_id,
563
+ c.document_id,
564
+ c.content,
565
+ c.content_type,
566
+ c.chunk_index,
567
+ c.metadata,
568
+ c.created_at,
569
+ d.source as document_source,
570
+ d.title as document_title,
571
+ d.content_type as document_content_type,
572
+ d.content_id as document_content_id
573
+ FROM chunks c
574
+ JOIN documents d ON c.document_id = d.id
575
+ WHERE c.content_type = ?
576
+ ORDER BY d.source, c.chunk_index
577
577
  `;
578
578
  const results = await connection.all(sql, [contentType]);
579
579
  // Parse metadata JSON strings back to objects
@@ -594,16 +594,16 @@ export async function getChunksByContentType(connection, contentType) {
594
594
  export async function getContentTypeStatistics(connection) {
595
595
  try {
596
596
  // Get document statistics
597
- const docStats = await connection.all(`
598
- SELECT content_type, COUNT(*) as count
599
- FROM documents
600
- GROUP BY content_type
597
+ const docStats = await connection.all(`
598
+ SELECT content_type, COUNT(*) as count
599
+ FROM documents
600
+ GROUP BY content_type
601
601
  `);
602
602
  // Get chunk statistics
603
- const chunkStats = await connection.all(`
604
- SELECT content_type, COUNT(*) as count
605
- FROM chunks
606
- GROUP BY content_type
603
+ const chunkStats = await connection.all(`
604
+ SELECT content_type, COUNT(*) as count
605
+ FROM chunks
606
+ GROUP BY content_type
607
607
  `);
608
608
  // Get totals
609
609
  const totalDocs = await connection.get('SELECT COUNT(*) as count FROM documents');
@@ -672,11 +672,11 @@ export async function updateChunkMetadata(connection, chunkId, metadata) {
672
672
  */
673
673
  export async function insertContentMetadata(connection, contentMetadata) {
674
674
  try {
675
- await connection.run(`
676
- INSERT INTO content_metadata (
677
- id, storage_type, original_path, content_path, display_name,
678
- content_type, file_size, content_hash
679
- ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
675
+ await connection.run(`
676
+ INSERT INTO content_metadata (
677
+ id, storage_type, original_path, content_path, display_name,
678
+ content_type, file_size, content_hash
679
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
680
680
  `, [
681
681
  contentMetadata.id,
682
682
  contentMetadata.storageType,
@@ -703,11 +703,11 @@ export async function insertContentMetadata(connection, contentMetadata) {
703
703
  */
704
704
  export async function getContentMetadata(connection, contentId) {
705
705
  try {
706
- const result = await connection.get(`
707
- SELECT id, storage_type, original_path, content_path, display_name,
708
- content_type, file_size, content_hash, created_at
709
- FROM content_metadata
710
- WHERE id = ?
706
+ const result = await connection.get(`
707
+ SELECT id, storage_type, original_path, content_path, display_name,
708
+ content_type, file_size, content_hash, created_at
709
+ FROM content_metadata
710
+ WHERE id = ?
711
711
  `, [contentId]);
712
712
  if (!result) {
713
713
  return null;
@@ -736,11 +736,11 @@ export async function getContentMetadata(connection, contentId) {
736
736
  */
737
737
  export async function getContentMetadataByHash(connection, contentHash) {
738
738
  try {
739
- const result = await connection.get(`
740
- SELECT id, storage_type, original_path, content_path, display_name,
741
- content_type, file_size, content_hash, created_at
742
- FROM content_metadata
743
- WHERE content_hash = ?
739
+ const result = await connection.get(`
740
+ SELECT id, storage_type, original_path, content_path, display_name,
741
+ content_type, file_size, content_hash, created_at
742
+ FROM content_metadata
743
+ WHERE content_hash = ?
744
744
  `, [contentHash]);
745
745
  if (!result) {
746
746
  return null;
@@ -769,12 +769,12 @@ export async function getContentMetadataByHash(connection, contentHash) {
769
769
  */
770
770
  export async function getContentMetadataByStorageType(connection, storageType) {
771
771
  try {
772
- const results = await connection.all(`
773
- SELECT id, storage_type, original_path, content_path, display_name,
774
- content_type, file_size, content_hash, created_at
775
- FROM content_metadata
776
- WHERE storage_type = ?
777
- ORDER BY created_at DESC
772
+ const results = await connection.all(`
773
+ SELECT id, storage_type, original_path, content_path, display_name,
774
+ content_type, file_size, content_hash, created_at
775
+ FROM content_metadata
776
+ WHERE storage_type = ?
777
+ ORDER BY created_at DESC
778
778
  `, [storageType]);
779
779
  return results.map((result) => ({
780
780
  id: result.id,
@@ -814,11 +814,11 @@ export async function deleteContentMetadata(connection, contentId) {
814
814
  */
815
815
  export async function getStorageStats(connection) {
816
816
  try {
817
- const result = await connection.get(`
818
- SELECT content_dir_files, content_dir_size, filesystem_refs,
819
- last_cleanup, updated_at
820
- FROM storage_stats
821
- WHERE id = 1
817
+ const result = await connection.get(`
818
+ SELECT content_dir_files, content_dir_size, filesystem_refs,
819
+ last_cleanup, updated_at
820
+ FROM storage_stats
821
+ WHERE id = 1
822
822
  `);
823
823
  if (!result) {
824
824
  return null;
@@ -874,11 +874,11 @@ export async function updateStorageStats(connection, stats) {
874
874
  }
875
875
  else {
876
876
  // Insert new row with provided values and defaults
877
- const insertSql = `
878
- INSERT INTO storage_stats (
879
- id, content_dir_files, content_dir_size, filesystem_refs,
880
- last_cleanup, updated_at
881
- ) VALUES (1, ?, ?, ?, ?, CURRENT_TIMESTAMP)
877
+ const insertSql = `
878
+ INSERT INTO storage_stats (
879
+ id, content_dir_files, content_dir_size, filesystem_refs,
880
+ last_cleanup, updated_at
881
+ ) VALUES (1, ?, ?, ?, ?, CURRENT_TIMESTAMP)
882
882
  `;
883
883
  await connection.run(insertSql, [
884
884
  stats.contentDirFiles || 0,
@@ -162,9 +162,13 @@ export declare class IngestionPipeline {
162
162
  */
163
163
  private storeDocumentsAndChunksWithContentTypes;
164
164
  /**
165
- * Update vector index with new embeddings
165
+ * Update vector index with new embeddings (supports grouped content type storage)
166
166
  */
167
167
  private updateVectorIndex;
168
+ /**
169
+ * Filter documents based on ingestion mode to avoid processing incompatible content types
170
+ */
171
+ private filterDocumentsByMode;
168
172
  /**
169
173
  * Converts MIME type to simple content type for embedding function
170
174
  * @param mimeType - MIME type string (e.g., 'text/plain', 'image/jpeg')