smart_rag 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +33 -0
- data/README.en.md +115 -0
- data/README.md +144 -0
- data/config/database.yml +42 -0
- data/config/fulltext_search.yml +111 -0
- data/config/llm_config.yml +15 -0
- data/config/smart_rag.yml +156 -0
- data/db/fix_search_issues.sql +81 -0
- data/db/migrations/001_create_source_documents.rb +26 -0
- data/db/migrations/002_create_source_sections.rb +20 -0
- data/db/migrations/003_create_tags.rb +17 -0
- data/db/migrations/004_create_research_topics.rb +16 -0
- data/db/migrations/005_create_relationship_tables.rb +42 -0
- data/db/migrations/006_create_text_search_configs.rb +28 -0
- data/db/migrations/007_create_section_fts.rb +109 -0
- data/db/migrations/008_create_embeddings.rb +28 -0
- data/db/migrations/009_create_search_logs.rb +30 -0
- data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
- data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
- data/db/rebuild_fts_complete.sql +51 -0
- data/db/seeds/text_search_configs.sql +28 -0
- data/examples/01_quick_start.rb +32 -0
- data/examples/02_document_management.rb +41 -0
- data/examples/03_search_operations.rb +46 -0
- data/examples/04_topics_and_tags.rb +38 -0
- data/examples/05_advanced_patterns.rb +154 -0
- data/examples/06_error_handling_and_retry.rb +64 -0
- data/examples/README.md +42 -0
- data/examples/common.rb +57 -0
- data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
- data/lib/smart_rag/config.rb +126 -0
- data/lib/smart_rag/core/document_processor.rb +537 -0
- data/lib/smart_rag/core/embedding.rb +340 -0
- data/lib/smart_rag/core/fulltext_manager.rb +483 -0
- data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
- data/lib/smart_rag/core/query_processor.rb +577 -0
- data/lib/smart_rag/errors.rb +88 -0
- data/lib/smart_rag/models/embedding.rb +140 -0
- data/lib/smart_rag/models/model_base.rb +106 -0
- data/lib/smart_rag/models/research_topic.rb +171 -0
- data/lib/smart_rag/models/research_topic_section.rb +86 -0
- data/lib/smart_rag/models/research_topic_tag.rb +89 -0
- data/lib/smart_rag/models/search_log.rb +198 -0
- data/lib/smart_rag/models/section_fts.rb +170 -0
- data/lib/smart_rag/models/section_tag.rb +81 -0
- data/lib/smart_rag/models/source_document.rb +204 -0
- data/lib/smart_rag/models/source_section.rb +201 -0
- data/lib/smart_rag/models/tag.rb +214 -0
- data/lib/smart_rag/models/text_search_config.rb +168 -0
- data/lib/smart_rag/models.rb +116 -0
- data/lib/smart_rag/parsers/query_parser.rb +291 -0
- data/lib/smart_rag/retrieve.rb +745 -0
- data/lib/smart_rag/services/embedding_service.rb +278 -0
- data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
- data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
- data/lib/smart_rag/services/summarization_service.rb +322 -0
- data/lib/smart_rag/services/tag_service.rb +614 -0
- data/lib/smart_rag/services/vector_search_service.rb +347 -0
- data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
- data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
- data/lib/smart_rag/smart_chunking/merger.rb +94 -0
- data/lib/smart_rag/smart_chunking/parser.rb +75 -0
- data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
- data/lib/smart_rag/smart_chunking/section.rb +11 -0
- data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
- data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
- data/lib/smart_rag/version.rb +3 -0
- data/lib/smart_rag.rb +986 -0
- data/workers/analyze_content.rb +6 -0
- data/workers/get_embedding.rb +7 -0
- metadata +311 -0
|
-- Database fix script for SmartRAG search issues.
-- Repairs three defects:
--   1. text_search_configs rows pointing at the wrong jieba configuration name
--   2. documents whose language column does not match their actual content
--   3. stale fulltext indexes built with the old tokenizer

BEGIN;

-- Fix 1: pg_jieba installs its configuration as 'jiebacfg', not 'jieba'.
UPDATE text_search_configs
SET config_name = 'jiebacfg'
WHERE config_name = 'jieba';

-- Report how many config rows now carry the corrected name.
SELECT 'Fixed text_search_configs: Changed jieba to jiebacfg' AS status,
       COUNT(*) as updated_rows
FROM text_search_configs
WHERE config_name = 'jiebacfg';

-- Fix 2: mark documents as Chinese when any of their sections contain
-- CJK ideographs (U+4E00..U+9FFF), but only where the stored language is
-- 'en', empty, or NULL so deliberate settings are preserved.
WITH chinese_sections AS (
    SELECT DISTINCT ss.document_id
    FROM source_sections ss
    WHERE ss.content ~ '[\u4e00-\u9fff]'
)
UPDATE source_documents sd
SET language = 'zh'
WHERE sd.id IN (SELECT document_id FROM chinese_sections)
  AND (sd.language = 'en' OR sd.language IS NULL OR sd.language = '');

-- Report how many documents are now tagged Chinese.
SELECT 'Fixed source_documents language: Set to zh for Chinese documents' AS status,
       COUNT(*) as updated_docs
FROM source_documents
WHERE language = 'zh';

-- Fix 3: throw away the old fulltext rows and rebuild them.
DELETE FROM section_fts;

-- Re-seed one fts row per section; the tsvector columns start NULL and are
-- filled in by the maintenance trigger below.
INSERT INTO section_fts (section_id, document_id, language, fts_title, fts_content, fts_combined)
SELECT
    ss.id,
    ss.document_id,
    COALESCE(sd.language, 'zh'),
    NULL,
    NULL,
    NULL
FROM source_sections ss
JOIN source_documents sd ON sd.id = ss.document_id;

-- Touch every section so the AFTER UPDATE trigger regenerates the tsvectors
-- using the corrected language and tokenizer configuration.
UPDATE source_sections SET updated_at = CURRENT_TIMESTAMP;

COMMIT;

-- Verification queries ------------------------------------------------------

SELECT '=== Verification: text_search_configs ===' AS info;
SELECT language_code, config_name FROM text_search_configs WHERE language_code LIKE 'zh%';

SELECT '=== Verification: source_documents ===' AS info;
SELECT id, title, language FROM source_documents;

SELECT '=== Verification: section_fts sample ===' AS info;
SELECT section_id, language FROM section_fts LIMIT 5;

-- Smoke-test a Chinese fulltext query through the jieba tokenizer.
SELECT '=== Test: Fulltext search with Chinese ===' AS info;
SELECT ss.section_title, ssf.language
FROM section_fts ssf
JOIN source_sections ss ON ss.id = ssf.section_id
WHERE ssf.fts_combined @@ to_tsquery('jiebacfg', '小动物')
LIMIT 3;
# Migration: the source_documents table — one row per ingested document.
Sequel.migration do
  up do
    create_table :source_documents do
      primary_key :id
      String :title, null: false, size: 255
      String :url, text: true
      String :author, size: 255
      Date :publication_date
      String :language, size: 10, default: 'en'
      String :description, text: true
      # download_state: 0 = pending, 1 = completed, 2 = failed
      Integer :download_state, default: 0, null: false
      DateTime :created_at, default: Sequel::CURRENT_TIMESTAMP
      DateTime :updated_at, default: Sequel::CURRENT_TIMESTAMP
    end

    # Indexes backing the most frequent filters and sorts.
    add_index :source_documents, :download_state
    add_index :source_documents, :created_at
    add_index :source_documents, :language
  end

  down do
    drop_table :source_documents
  end
end
# Migration: source_sections — chunked content belonging to a document.
Sequel.migration do
  up do
    create_table :source_sections do
      primary_key :id
      # Sections cannot outlive their document.
      foreign_key :document_id, :source_documents, null: false, on_delete: :cascade
      String :content, text: true, null: false
      String :section_title, size: 500
      Integer :section_number, default: 0
      DateTime :created_at, default: Sequel::CURRENT_TIMESTAMP
      DateTime :updated_at, default: Sequel::CURRENT_TIMESTAMP
    end

    add_index :source_sections, :document_id
    add_index :source_sections, :section_number
  end

  down do
    drop_table :source_sections
  end
end
# Migration: tags — a self-referencing tag hierarchy.
Sequel.migration do
  up do
    create_table :tags do
      primary_key :id
      # unique: true already creates a unique index on :name, so no separate
      # add_index is needed (a duplicate plain index would only waste writes).
      String :name, null: false, unique: true, size: 255
      # Deleting a parent detaches its children rather than removing them.
      foreign_key :parent_id, :tags, on_delete: :set_null
      DateTime :created_at, default: Sequel::CURRENT_TIMESTAMP
    end

    add_index :tags, :parent_id
  end

  down do
    drop_table :tags
  end
end
# Migration: research_topics — named topics that group sections and tags.
Sequel.migration do
  up do
    create_table :research_topics do
      primary_key :id
      # unique: true already creates a unique index on :name; the original
      # extra add_index on :name was redundant and has been dropped.
      String :name, null: false, unique: true, size: 255
      String :description, text: true
      DateTime :created_at, default: Sequel::CURRENT_TIMESTAMP
    end
  end

  down do
    drop_table :research_topics
  end
end
# Migration: the three many-to-many join tables.
#
# Each table's composite primary key already provides a unique index over the
# pair of columns, so the original extra `unique: true` indexes duplicated the
# PK index and have been removed. Only the secondary single-column indexes
# (needed for reverse lookups) remain.
Sequel.migration do
  up do
    # Section <-> tag
    create_table :section_tags do
      foreign_key :section_id, :source_sections, null: false, on_delete: :cascade
      foreign_key :tag_id, :tags, null: false, on_delete: :cascade
      primary_key [:section_id, :tag_id]
      DateTime :created_at, default: Sequel::CURRENT_TIMESTAMP
    end

    add_index :section_tags, :tag_id

    # Research topic <-> section
    create_table :research_topic_sections do
      foreign_key :research_topic_id, :research_topics, null: false, on_delete: :cascade
      foreign_key :section_id, :source_sections, null: false, on_delete: :cascade
      primary_key [:research_topic_id, :section_id]
      DateTime :created_at, default: Sequel::CURRENT_TIMESTAMP
    end

    add_index :research_topic_sections, :section_id

    # Research topic <-> tag
    create_table :research_topic_tags do
      foreign_key :research_topic_id, :research_topics, null: false, on_delete: :cascade
      foreign_key :tag_id, :tags, null: false, on_delete: :cascade
      primary_key [:research_topic_id, :tag_id]
      DateTime :created_at, default: Sequel::CURRENT_TIMESTAMP
    end

    add_index :research_topic_tags, :tag_id
  end

  down do
    drop_table :research_topic_tags
    drop_table :research_topic_sections
    drop_table :section_tags
  end
end
# Migration: text_search_configs — maps a language code to the PostgreSQL
# text-search configuration used when building tsvectors for that language.
Sequel.migration do
  up do
    create_table :text_search_configs do
      String :language_code, primary_key: true
      String :config_name, null: false
      TrueClass :is_installed, default: true
    end

    # Reverse lookup: which languages use a given configuration.
    add_index :text_search_configs, :config_name

    # Seed the baseline mappings. The 'zh' entry assumes the pg_jieba
    # extension is installed for Chinese tokenization.
    from(:text_search_configs).multi_insert(
      [
        { language_code: 'en', config_name: 'pg_catalog.english' },
        { language_code: 'zh', config_name: 'jiebacfg' },
        { language_code: 'ja', config_name: 'pg_catalog.simple' },
        { language_code: 'ko', config_name: 'pg_catalog.simple' },
        { language_code: 'default', config_name: 'pg_catalog.simple' }
      ]
    )
  end

  down do
    drop_table :text_search_configs
  end
end
# Migration: section_fts — per-section tsvectors, maintained by triggers.
#
# Fixes over the original:
#   * :section_id is now a real foreign key (and primary key) referencing
#     source_sections, so fts rows are removed when a section is deleted on
#     its own. The original `primary_key :section_id` created a bare serial
#     column with no FK, leaving orphan rows behind.
#   * The document-language trigger now touches source_sections so the
#     per-section trigger actually re-tokenizes content; the original merely
#     bumped section_fts.updated_at and left stale tsvectors in place.
Sequel.migration do
  up do
    create_table :section_fts do
      # One-to-one with source_sections: the section id is both PK and FK.
      foreign_key :section_id, :source_sections, primary_key: true, on_delete: :cascade
      foreign_key :document_id, :source_documents, on_delete: :cascade
      String :language, null: false, default: 'en'
      column :fts_title, 'tsvector'
      column :fts_content, 'tsvector'
      column :fts_combined, 'tsvector'
      DateTime :created_at, default: Sequel::CURRENT_TIMESTAMP
      DateTime :updated_at, default: Sequel::CURRENT_TIMESTAMP
    end

    # GIN indexes for tsquery matching, plus a language filter index.
    add_index :section_fts, :fts_combined, type: :gin
    add_index :section_fts, :language
    add_index :section_fts, :fts_title, type: :gin

    # Partial GIN indexes for the most common languages speed up
    # language-scoped searches.
    run 'CREATE INDEX section_fts_gin_zh ON section_fts USING GIN (fts_combined) WHERE language = \'zh\''
    run 'CREATE INDEX section_fts_gin_en ON section_fts USING GIN (fts_combined) WHERE language = \'en\''
    run 'CREATE INDEX section_fts_gin_ja ON section_fts USING GIN (fts_combined) WHERE language = \'ja\''

    add_index :section_fts, :document_id

    # Trigger function: upsert the fts row for a section using the document's
    # language and the matching text-search configuration.
    run <<-SQL
      CREATE OR REPLACE FUNCTION update_section_fts()
      RETURNS TRIGGER AS $$
      DECLARE
        v_language TEXT;
        v_config TEXT;
      BEGIN
        -- Resolve the owning document's language (default 'en').
        SELECT COALESCE(sd.language, 'en') INTO v_language
        FROM source_documents sd
        WHERE sd.id = NEW.document_id;

        -- Map the language to a text-search configuration (default simple).
        SELECT COALESCE(tsc.config_name, 'pg_catalog.simple') INTO v_config
        FROM text_search_configs tsc
        WHERE tsc.language_code = v_language;

        -- Title is weighted 'A', body 'B'; fts_combined concatenates both.
        INSERT INTO section_fts (section_id, document_id, language, fts_title, fts_content, fts_combined)
        VALUES (
          NEW.id,
          NEW.document_id,
          v_language,
          setweight(to_tsvector(v_config::regconfig, COALESCE(NEW.section_title,'')), 'A'),
          setweight(to_tsvector(v_config::regconfig, COALESCE(NEW.content,'')), 'B'),
          setweight(to_tsvector(v_config::regconfig, COALESCE(NEW.section_title,'')), 'A') ||
          setweight(to_tsvector(v_config::regconfig, COALESCE(NEW.content,'')), 'B')
        )
        ON CONFLICT (section_id) DO UPDATE SET
          document_id = NEW.document_id,
          language = v_language,
          fts_title = setweight(to_tsvector(v_config::regconfig, COALESCE(NEW.section_title,'')), 'A'),
          fts_content = setweight(to_tsvector(v_config::regconfig, COALESCE(NEW.content,'')), 'B'),
          fts_combined = setweight(to_tsvector(v_config::regconfig, COALESCE(NEW.section_title,'')), 'A') ||
                         setweight(to_tsvector(v_config::regconfig, COALESCE(NEW.content,'')), 'B'),
          updated_at = CURRENT_TIMESTAMP;

        RETURN NEW;
      END;
      $$ LANGUAGE plpgsql;
    SQL

    run <<-SQL
      CREATE TRIGGER trigger_update_section_fts
      AFTER INSERT OR UPDATE ON source_sections
      FOR EACH ROW EXECUTE FUNCTION update_section_fts();
    SQL

    # When a document's language changes, touch its sections so the trigger
    # above rebuilds their tsvectors with the new configuration.
    run <<-SQL
      CREATE OR REPLACE FUNCTION update_section_fts_on_doc_update()
      RETURNS TRIGGER AS $$
      BEGIN
        IF NEW.language IS DISTINCT FROM OLD.language THEN
          UPDATE source_sections
          SET updated_at = CURRENT_TIMESTAMP
          WHERE document_id = NEW.id;
        END IF;
        RETURN NEW;
      END;
      $$ LANGUAGE plpgsql;
    SQL

    run <<-SQL
      CREATE TRIGGER trigger_update_section_fts_on_doc
      AFTER UPDATE OF language ON source_documents
      FOR EACH ROW EXECUTE FUNCTION update_section_fts_on_doc_update();
    SQL
  end

  down do
    run 'DROP TRIGGER IF EXISTS trigger_update_section_fts_on_doc ON source_documents'
    run 'DROP FUNCTION IF EXISTS update_section_fts_on_doc_update()'
    run 'DROP TRIGGER IF EXISTS trigger_update_section_fts ON source_sections'
    run 'DROP FUNCTION IF EXISTS update_section_fts()'
    drop_table :section_fts
  end
end
# Migration: embeddings — pgvector storage for section embedding vectors.
Sequel.migration do
  up do
    # pgvector must be available before the vector column type can be used.
    run 'CREATE EXTENSION IF NOT EXISTS vector'

    create_table :embeddings do
      primary_key :id
      foreign_key :source_id, :source_sections, null: false, on_delete: :cascade
      # 1024 dimensions; adjust to match the embedding model in use.
      column :vector, 'vector(1024)', null: false
      DateTime :created_at, default: Sequel::CURRENT_TIMESTAMP
    end

    # IVFFLAT approximate-nearest-neighbour index using cosine distance.
    # NOTE(review): pgvector recommends building ivfflat indexes after data
    # is loaded, since the lists are derived from existing rows — an index
    # created on an empty table may need a REINDEX later. Confirm whether
    # the ingestion pipeline accounts for this.
    run 'CREATE INDEX idx_embedding_vector ON embeddings USING ivfflat (vector vector_cosine_ops) WITH (lists = 100)'

    add_index :embeddings, :source_id
    add_index :embeddings, :created_at
  end

  down do
    drop_table :embeddings
  end
end
# Migration: search_logs — per-query telemetry for search analytics.
Sequel.migration do
  up do
    create_table :search_logs do
      primary_key :id
      String :query, null: false, text: true
      String :search_type, size: 20 # 'vector', 'fulltext', 'hybrid'
      Integer :execution_time_ms
      Integer :results_count
      column :query_vector, 'vector(1024)' # query embedding, for relevance analysis
      column :result_ids, 'integer[]'      # returned section ids
      column :filters, 'jsonb'             # filters applied to the search
      DateTime :created_at, default: Sequel::CURRENT_TIMESTAMP
    end

    # Single-column indexes for performance dashboards.
    add_index :search_logs, :created_at
    add_index :search_logs, :search_type
    add_index :search_logs, :execution_time_ms

    # GIN index so logged query text is itself full-text searchable.
    run "CREATE INDEX search_logs_query_idx ON search_logs USING gin (to_tsvector('simple', query))"

    # Covers the common "recent searches of type X" analytics query.
    add_index :search_logs, [:search_type, :created_at]
  end

  down do
    drop_table :search_logs
  end
end
# Migration: provenance columns on source_documents (origin type, URI, and a
# content hash used for de-duplication).
Sequel.migration do
  up do
    add_column :source_documents, :source_type, String, size: 50, default: 'manual'
    add_column :source_documents, :source_uri, String, text: true
    add_column :source_documents, :content_hash, String, size: 128

    add_index :source_documents, :source_type
    add_index :source_documents, :source_uri
    add_index :source_documents, :content_hash
    # Composite index supports the "same URI, same content?" dedupe check.
    add_index :source_documents, [:source_uri, :content_hash]
  end

  down do
    drop_index :source_documents, [:source_uri, :content_hash]
    drop_index :source_documents, :content_hash
    drop_index :source_documents, :source_uri
    drop_index :source_documents, :source_type

    drop_column :source_documents, :content_hash
    drop_column :source_documents, :source_uri
    drop_column :source_documents, :source_type
  end
end
-- Complete rebuild of section_fts fulltext indexes.
-- Rebuilds all fulltext search rows with the correct language and tokenizer.

BEGIN;

-- 1. Drop all existing fts rows.
DELETE FROM section_fts;

-- 2. Re-seed one row per section; the tsvector columns are filled by the
--    maintenance trigger in step 3.
INSERT INTO section_fts (section_id, document_id, language)
SELECT
    ss.id,
    ss.document_id,
    COALESCE(sd.language, 'zh') as language
FROM source_sections ss
JOIN source_documents sd ON sd.id = ss.document_id;

-- 3. Touch every section so the AFTER UPDATE trigger regenerates its
--    tsvectors. A single statement replaces the original LIMIT/OFFSET
--    "batches": those ran inside this same transaction (so they provided no
--    locking relief), had no ORDER BY (so the pages were nondeterministic),
--    and silently skipped any rows beyond the first 3000.
UPDATE source_sections SET updated_at = CURRENT_TIMESTAMP;

-- Verify rebuild
SELECT '=== Verification ===' as info;
SELECT COUNT(*) as total_sections FROM section_fts;
SELECT COUNT(*) as sections_with_fts_title FROM section_fts WHERE fts_title IS NOT NULL;
SELECT COUNT(*) as sections_with_fts_content FROM section_fts WHERE fts_content IS NOT NULL;
SELECT COUNT(*) as sections_with_fts_combined FROM section_fts WHERE fts_combined IS NOT NULL;

-- Smoke-test a Chinese search through the jieba tokenizer.
SELECT '=== Test Search for 老人情感 ===' as info;
SELECT section_id, section_title
FROM section_fts ssf
JOIN source_sections ss ON ss.id = ssf.section_id
WHERE ssf.fts_combined @@ plainto_tsquery('jiebacfg', '老人情感')
  AND ssf.language = 'zh'
LIMIT 5;

COMMIT;

SELECT '=== Complete! All fulltext indexes rebuilt ===' as status;
-- Seed data for the text_search_configs table: standard language-to-
-- configuration mappings used when building full-text search vectors.

-- Start from a clean slate.
TRUNCATE TABLE text_search_configs;

-- Language code -> PostgreSQL text-search configuration.
INSERT INTO text_search_configs (language_code, config_name, is_installed) VALUES
('en', 'pg_catalog.english', true),
('en_us', 'pg_catalog.english', true),
('en_gb', 'pg_catalog.english', true),
('zh', 'jiebacfg', true),
('zh_cn', 'jiebacfg', true),
('zh_tw', 'jiebacfg', true),
('ja', 'pg_catalog.simple', true),
('ja_jp', 'pg_catalog.simple', true),
('ko', 'pg_catalog.simple', true),
('ko_kr', 'pg_catalog.simple', true),
('es', 'pg_catalog.spanish', true),
('fr', 'pg_catalog.french', true),
('de', 'pg_catalog.german', true),
('it', 'pg_catalog.italian', true),
('ru', 'pg_catalog.russian', true),
('ar', 'pg_catalog.simple', true),
('default', 'pg_catalog.simple', true);

-- Document the pg_jieba requirement for the Chinese configurations.
COMMENT ON COLUMN text_search_configs.config_name IS 'Full-text search configuration name. For Chinese support, ensure pg_jieba extension is installed';
#!/usr/bin/env ruby
# frozen_string_literal: true

# Quick-start walkthrough: build a client, show basic stats, and run one
# hybrid search (query taken from ARGV, with a default).

require_relative "common"

include Examples::Common

print_header("Quick Start")
smart_rag = build_client

stats = smart_rag.statistics
puts "SmartRAG initialized."
puts "Documents in DB: #{stats[:document_count]}"

query = ARGV[0] || "machine learning algorithms"
results = smart_rag.search(
  query,
  search_type: "hybrid",
  limit: 5,
  include_content: true,
)

print_header("First Search: #{query}")
results.fetch(:results, []).each_with_index do |result, idx|
  score = result[:combined_score] || result[:similarity] || 0.0
  puts "#{idx + 1}. #{result[:section_title]} (score: #{score.round(3)})"
  next unless result[:content]

  preview = result[:content][0, 120].to_s.gsub(/\s+/, " ")
  puts "  #{preview}..."
end
#!/usr/bin/env ruby
# frozen_string_literal: true

# Document lifecycle demo: add a document from a path given on ARGV, fetch
# its detail, list matching documents, and optionally delete it (DELETE=1).

require_relative "common"

include Examples::Common

print_header("Document Management")
smart_rag = build_client

document_path = ARGV[0]
if document_path.nil? || document_path.strip.empty?
  warn "Usage: ruby examples/02_document_management.rb /path/to/document.md"
  exit 1
end

add_result = smart_rag.add_document(
  document_path,
  title: File.basename(document_path),
  generate_embeddings: true,
  generate_tags: true,
  tags: ["example", "usage_examples"],
  metadata: { source: "examples/02_document_management.rb" },
)

print_json("Add Result", add_result)

document_id = add_result[:document_id]
detail = smart_rag.get_document(document_id)
print_json("Document Detail", detail || {})

list = smart_rag.list_documents(page: 1, per_page: 10, search: File.basename(document_path))
print_json("Document List", list)

# Pass DELETE=1 to demonstrate cleanup:
#   DELETE=1 ruby examples/02_document_management.rb test/python_basics.md
if ENV["DELETE"] == "1"
  delete_result = smart_rag.remove_document(document_id)
  print_json("Delete Result", delete_result)
end
#!/usr/bin/env ruby
# frozen_string_literal: true

# Search API tour: hybrid, vector-only, fulltext, and multilingual queries.

require_relative "common"

include Examples::Common

print_header("Search Operations")
smart_rag = build_client

query = ARGV[0] || "deep learning applications in healthcare"

# Hybrid search blends vector and fulltext scores (alpha weights the mix).
hybrid = smart_rag.search(
  query,
  search_type: "hybrid",
  limit: 5,
  alpha: 0.7,
  include_metadata: true,
)
print_json("Hybrid Search Metadata", hybrid[:metadata] || {})

# Pure vector (embedding similarity) search.
vector = smart_rag.vector_search(
  "neural network architectures",
  limit: 5,
  include_content: false,
)
print_json("Vector Search Metadata", vector[:metadata] || {})

# Fulltext search supports quoted phrases.
fulltext = smart_rag.fulltext_search(
  '"deep reinforcement learning"',
  limit: 5,
)
print_json("Fulltext Search Metadata", fulltext[:metadata] || {})

# Language-tagged queries; "auto" lets the backend detect the language.
multilingual_queries = [
  ["zh_cn", "人工智能应用"],
  ["ja", "機械学習アルゴリズム"],
  ["ko", "딥러닝 모델"],
  ["auto", "AI和机器学习的发展"],
]

multilingual_queries.each do |language, q|
  result = smart_rag.search(q, search_type: "hybrid", language: language, limit: 3)
  puts "[#{language}] #{q} -> #{result.fetch(:results, []).length} results"
end
#!/usr/bin/env ruby
# frozen_string_literal: true

# Topics & tags demo: create a topic, inspect it, list topics, fetch
# recommendations, then auto-generate and list tags.

require_relative "common"

include Examples::Common

print_header("Topics And Tags")
smart_rag = build_client

topic = smart_rag.create_topic(
  "AI in Healthcare",
  "Applications of AI in medical diagnosis and treatment",
  tags: ["ai", "healthcare", "diagnosis"],
  document_ids: [],
)
print_json("Created Topic", topic)

topic_detail = smart_rag.get_topic(topic[:id])
print_json("Topic Detail", topic_detail || {})

all_topics = smart_rag.list_topics(page: 1, per_page: 20)
puts "Topic count on page: #{all_topics.fetch(:topics, []).length}"

recommendations = smart_rag.get_topic_recommendations(topic[:id], limit: 5)
print_json("Topic Recommendations", recommendations)

sample_text = <<~TEXT
  Machine learning is a subset of artificial intelligence that enables systems
  to learn and improve from experience without explicit programming.
TEXT

tags = smart_rag.generate_tags(sample_text, context: "AI introduction", max_tags: 5)
print_json("Generated Tags", tags)

tag_page = smart_rag.list_tags(page: 1, per_page: 20)
puts "Tag count on page: #{tag_page.fetch(:tags, []).length}"