smart_rag 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +33 -0
  3. data/README.en.md +115 -0
  4. data/README.md +144 -0
  5. data/config/database.yml +42 -0
  6. data/config/fulltext_search.yml +111 -0
  7. data/config/llm_config.yml +15 -0
  8. data/config/smart_rag.yml +156 -0
  9. data/db/fix_search_issues.sql +81 -0
  10. data/db/migrations/001_create_source_documents.rb +26 -0
  11. data/db/migrations/002_create_source_sections.rb +20 -0
  12. data/db/migrations/003_create_tags.rb +17 -0
  13. data/db/migrations/004_create_research_topics.rb +16 -0
  14. data/db/migrations/005_create_relationship_tables.rb +42 -0
  15. data/db/migrations/006_create_text_search_configs.rb +28 -0
  16. data/db/migrations/007_create_section_fts.rb +109 -0
  17. data/db/migrations/008_create_embeddings.rb +28 -0
  18. data/db/migrations/009_create_search_logs.rb +30 -0
  19. data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
  20. data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
  21. data/db/rebuild_fts_complete.sql +51 -0
  22. data/db/seeds/text_search_configs.sql +28 -0
  23. data/examples/01_quick_start.rb +32 -0
  24. data/examples/02_document_management.rb +41 -0
  25. data/examples/03_search_operations.rb +46 -0
  26. data/examples/04_topics_and_tags.rb +38 -0
  27. data/examples/05_advanced_patterns.rb +154 -0
  28. data/examples/06_error_handling_and_retry.rb +64 -0
  29. data/examples/README.md +42 -0
  30. data/examples/common.rb +57 -0
  31. data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
  32. data/lib/smart_rag/config.rb +126 -0
  33. data/lib/smart_rag/core/document_processor.rb +537 -0
  34. data/lib/smart_rag/core/embedding.rb +340 -0
  35. data/lib/smart_rag/core/fulltext_manager.rb +483 -0
  36. data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
  37. data/lib/smart_rag/core/query_processor.rb +577 -0
  38. data/lib/smart_rag/errors.rb +88 -0
  39. data/lib/smart_rag/models/embedding.rb +140 -0
  40. data/lib/smart_rag/models/model_base.rb +106 -0
  41. data/lib/smart_rag/models/research_topic.rb +171 -0
  42. data/lib/smart_rag/models/research_topic_section.rb +86 -0
  43. data/lib/smart_rag/models/research_topic_tag.rb +89 -0
  44. data/lib/smart_rag/models/search_log.rb +198 -0
  45. data/lib/smart_rag/models/section_fts.rb +170 -0
  46. data/lib/smart_rag/models/section_tag.rb +81 -0
  47. data/lib/smart_rag/models/source_document.rb +204 -0
  48. data/lib/smart_rag/models/source_section.rb +201 -0
  49. data/lib/smart_rag/models/tag.rb +214 -0
  50. data/lib/smart_rag/models/text_search_config.rb +168 -0
  51. data/lib/smart_rag/models.rb +116 -0
  52. data/lib/smart_rag/parsers/query_parser.rb +291 -0
  53. data/lib/smart_rag/retrieve.rb +745 -0
  54. data/lib/smart_rag/services/embedding_service.rb +278 -0
  55. data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
  56. data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
  57. data/lib/smart_rag/services/summarization_service.rb +322 -0
  58. data/lib/smart_rag/services/tag_service.rb +614 -0
  59. data/lib/smart_rag/services/vector_search_service.rb +347 -0
  60. data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
  61. data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
  62. data/lib/smart_rag/smart_chunking/merger.rb +94 -0
  63. data/lib/smart_rag/smart_chunking/parser.rb +75 -0
  64. data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
  65. data/lib/smart_rag/smart_chunking/section.rb +11 -0
  66. data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
  67. data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
  68. data/lib/smart_rag/version.rb +3 -0
  69. data/lib/smart_rag.rb +986 -0
  70. data/workers/analyze_content.rb +6 -0
  71. data/workers/get_embedding.rb +7 -0
  72. metadata +311 -0
@@ -0,0 +1,81 @@
1
+ -- Database fix script for SmartRAG search issues
2
+ -- This script fixes three bugs:
3
+ -- 1. Incorrect jieba configuration name
4
+ -- 2. Wrong language in existing data
5
+ -- 3. Rebuild fulltext indexes with correct tokenizer
6
+
7
+ BEGIN;
8
+
9
+ -- Fix 1: Update text_search_configs to use correct jieba config name
10
+ UPDATE text_search_configs
11
+ SET config_name = 'jiebacfg'
12
+ WHERE config_name = 'jieba';
13
+
14
+ -- Verify the fix (counts every row now using jiebacfg, not only the rows changed above)
15
+ SELECT 'Fixed text_search_configs: Changed jieba to jiebacfg' AS status,
16
+ COUNT(*) as updated_rows
17
+ FROM text_search_configs
18
+ WHERE config_name = 'jiebacfg';
19
+
20
+ -- Fix 2: Detect and update language for source_documents
21
+ -- For documents with Chinese content, set language to 'zh'
22
+ -- First, identify documents with Chinese content based on sections
23
+ WITH chinese_sections AS (
24
+ SELECT DISTINCT ss.document_id
25
+ FROM source_sections ss
26
+ WHERE ss.content ~ '[\u4e00-\u9fff]'
27
+ )
28
+ UPDATE source_documents sd
29
+ SET language = 'zh'
30
+ WHERE sd.id IN (SELECT document_id FROM chinese_sections)
31
+ AND (sd.language = 'en' OR sd.language IS NULL OR sd.language = '');
32
+
33
+ -- Verify the fix (counts every document whose language is zh, including ones that already were)
34
+ SELECT 'Fixed source_documents language: Set to zh for Chinese documents' AS status,
35
+ COUNT(*) as updated_docs
36
+ FROM source_documents
37
+ WHERE language = 'zh';
38
+
39
+ -- Fix 3: Rebuild fulltext indexes
40
+ -- Delete existing fulltext indexes
41
+ DELETE FROM section_fts;
42
+
43
+ -- Re-create one placeholder row per section (the tsvector columns are left NULL here)
44
+ -- The trigger fired by the UPDATE further below fills them using the updated language and correct config name
45
+ INSERT INTO section_fts (section_id, document_id, language, fts_title, fts_content, fts_combined)
46
+ SELECT
47
+ ss.id,
48
+ ss.document_id,
49
+ COALESCE(sd.language, 'zh'),
50
+ NULL,
51
+ NULL,
52
+ NULL
53
+ FROM source_sections ss
54
+ JOIN source_documents sd ON sd.id = ss.document_id;
55
+
56
+ -- Now update the fts fields by calling the trigger
57
+ -- This will cause the trigger to fire and rebuild with correct language/tokenizer
58
+ UPDATE source_sections SET updated_at = CURRENT_TIMESTAMP;
59
+
60
+ COMMIT;
61
+
62
+ -- Verification queries
63
+ -- Check updated text_search_configs
64
+ SELECT '=== Verification: text_search_configs ===' AS info;
65
+ SELECT language_code, config_name FROM text_search_configs WHERE language_code LIKE 'zh%';
66
+
67
+ -- Check updated source_documents
68
+ SELECT '=== Verification: source_documents ===' AS info;
69
+ SELECT id, title, language FROM source_documents;
70
+
71
+ -- Check rebuilt section_fts
72
+ SELECT '=== Verification: section_fts sample ===' AS info;
73
+ SELECT section_id, language FROM section_fts LIMIT 5;
74
+
75
+ -- Test fulltext search with Chinese
76
+ SELECT '=== Test: Fulltext search with Chinese ===' AS info;
77
+ SELECT ss.section_title, ssf.language
78
+ FROM section_fts ssf
79
+ JOIN source_sections ss ON ss.id = ssf.section_id
80
+ WHERE ssf.fts_combined @@ to_tsquery('jiebacfg', '小动物')
81
+ LIMIT 3;
@@ -0,0 +1,26 @@
1
+ Sequel.migration do
2
+ up do
3
+ create_table :source_documents do
4
+ primary_key :id
5
+ String :title, null: false, size: 255
6
+ String :url, text: true
7
+ String :author, size: 255
8
+ Date :publication_date
9
+ String :language, size: 10, default: 'en'
10
+ String :description, text: true
11
+ # 0: pending, 1: completed, 2: failed
12
+ Integer :download_state, default: 0, null: false
13
+ DateTime :created_at, default: Sequel::CURRENT_TIMESTAMP
14
+ DateTime :updated_at, default: Sequel::CURRENT_TIMESTAMP
15
+ end
16
+
17
+ # Add index for common queries
18
+ add_index :source_documents, :download_state
19
+ add_index :source_documents, :created_at
20
+ add_index :source_documents, :language
21
+ end
22
+
23
+ down do
24
+ drop_table :source_documents
25
+ end
26
+ end
@@ -0,0 +1,20 @@
1
+ Sequel.migration do
2
+ up do
3
+ create_table :source_sections do
4
+ primary_key :id
5
+ foreign_key :document_id, :source_documents, null: false, on_delete: :cascade
6
+ String :content, text: true, null: false
7
+ String :section_title, size: 500
8
+ Integer :section_number, default: 0
9
+ DateTime :created_at, default: Sequel::CURRENT_TIMESTAMP
10
+ DateTime :updated_at, default: Sequel::CURRENT_TIMESTAMP
11
+ end
12
+
13
+ add_index :source_sections, :document_id
14
+ add_index :source_sections, :section_number
15
+ end
16
+
17
+ down do
18
+ drop_table :source_sections
19
+ end
20
+ end
@@ -0,0 +1,17 @@
1
+ Sequel.migration do
2
+ up do
3
+ create_table :tags do
4
+ primary_key :id
5
+ String :name, null: false, unique: true, size: 255
6
+ foreign_key :parent_id, :tags, on_delete: :set_null
7
+ DateTime :created_at, default: Sequel::CURRENT_TIMESTAMP
8
+ end
9
+
10
+ add_index :tags, :name
11
+ add_index :tags, :parent_id
12
+ end
13
+
14
+ down do
15
+ drop_table :tags
16
+ end
17
+ end
@@ -0,0 +1,16 @@
1
+ Sequel.migration do
2
+ up do
3
+ create_table :research_topics do
4
+ primary_key :id
5
+ String :name, null: false, unique: true, size: 255
6
+ String :description, text: true
7
+ DateTime :created_at, default: Sequel::CURRENT_TIMESTAMP
8
+ end
9
+
10
+ add_index :research_topics, :name
11
+ end
12
+
13
+ down do
14
+ drop_table :research_topics
15
+ end
16
+ end
@@ -0,0 +1,42 @@
1
+ Sequel.migration do
2
+ up do
3
+ # Section tags (many-to-many)
4
+ create_table :section_tags do
5
+ foreign_key :section_id, :source_sections, null: false, on_delete: :cascade
6
+ foreign_key :tag_id, :tags, null: false, on_delete: :cascade
7
+ primary_key [:section_id, :tag_id]
8
+ DateTime :created_at, default: Sequel::CURRENT_TIMESTAMP
9
+ end
10
+
11
+ add_index :section_tags, [:section_id, :tag_id], unique: true
12
+ add_index :section_tags, :tag_id
13
+
14
+ # Research topic sections (many-to-many)
15
+ create_table :research_topic_sections do
16
+ foreign_key :research_topic_id, :research_topics, null: false, on_delete: :cascade
17
+ foreign_key :section_id, :source_sections, null: false, on_delete: :cascade
18
+ primary_key [:research_topic_id, :section_id]
19
+ DateTime :created_at, default: Sequel::CURRENT_TIMESTAMP
20
+ end
21
+
22
+ add_index :research_topic_sections, [:research_topic_id, :section_id], unique: true
23
+ add_index :research_topic_sections, :section_id
24
+
25
+ # Research topic tags (many-to-many)
26
+ create_table :research_topic_tags do
27
+ foreign_key :research_topic_id, :research_topics, null: false, on_delete: :cascade
28
+ foreign_key :tag_id, :tags, null: false, on_delete: :cascade
29
+ primary_key [:research_topic_id, :tag_id]
30
+ DateTime :created_at, default: Sequel::CURRENT_TIMESTAMP
31
+ end
32
+
33
+ add_index :research_topic_tags, [:research_topic_id, :tag_id], unique: true
34
+ add_index :research_topic_tags, :tag_id
35
+ end
36
+
37
+ down do
38
+ drop_table :research_topic_tags
39
+ drop_table :research_topic_sections
40
+ drop_table :section_tags
41
+ end
42
+ end
@@ -0,0 +1,28 @@
1
+ Sequel.migration do
2
+ up do
3
+ create_table :text_search_configs do
4
+ String :language_code, primary_key: true
5
+ String :config_name, null: false
6
+ TrueClass :is_installed, default: true
7
+ end
8
+
9
+ # Add index for config name lookup
10
+ add_index :text_search_configs, :config_name
11
+
12
+ # Seed initial language configurations
13
+ # Note: These assume pg_jieba extension is installed for Chinese support
14
+ from(:text_search_configs).multi_insert(
15
+ [
16
+ { language_code: 'en', config_name: 'pg_catalog.english' },
17
+ { language_code: 'zh', config_name: 'jiebacfg' },
18
+ { language_code: 'ja', config_name: 'pg_catalog.simple' },
19
+ { language_code: 'ko', config_name: 'pg_catalog.simple' },
20
+ { language_code: 'default', config_name: 'pg_catalog.simple' }
21
+ ]
22
+ )
23
+ end
24
+
25
+ down do
26
+ drop_table :text_search_configs
27
+ end
28
+ end
@@ -0,0 +1,109 @@
1
+ Sequel.migration do
2
+ up do
3
+ create_table :section_fts do
4
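+ # One row per source_sections row, keyed by section_id. NOTE(review): declared as a plain primary key with no foreign key to source_sections, so deleting a section does not cascade here - confirm this is intended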
5
+ primary_key :section_id
6
+ foreign_key :document_id, :source_documents, on_delete: :cascade
7
+ String :language, null: false, default: 'en'
8
+ column :fts_title, 'tsvector'
9
+ column :fts_content, 'tsvector'
10
+ column :fts_combined, 'tsvector'
11
+ DateTime :created_at, default: Sequel::CURRENT_TIMESTAMP
12
+ DateTime :updated_at, default: Sequel::CURRENT_TIMESTAMP
13
+ end
14
+
15
+ # Primary GIN index for combined (title + content) search
16
+ add_index :section_fts, :fts_combined, type: :gin
17
+ add_index :section_fts, :language
18
+ add_index :section_fts, :fts_title, type: :gin
19
+
20
+ # Partial indexes for frequently used languages
21
+ # These improve query performance for language-specific searches
22
+ run 'CREATE INDEX section_fts_gin_zh ON section_fts USING GIN (fts_combined) WHERE language = \'zh\''
23
+ run 'CREATE INDEX section_fts_gin_en ON section_fts USING GIN (fts_combined) WHERE language = \'en\''
24
+ run 'CREATE INDEX section_fts_gin_ja ON section_fts USING GIN (fts_combined) WHERE language = \'ja\''
25
+
26
+ # Foreign key index
27
+ add_index :section_fts, :document_id
28
+
29
+ # Create trigger function to automatically maintain FTS data
30
+ run <<-SQL
31
+ CREATE OR REPLACE FUNCTION update_section_fts()
32
+ RETURNS TRIGGER AS $$
33
+ DECLARE
34
+ v_language TEXT;
35
+ v_config TEXT;
36
+ BEGIN
37
+ -- Get document language
38
+ SELECT COALESCE(sd.language, 'en') INTO v_language
39
+ FROM source_documents sd
40
+ WHERE sd.id = NEW.document_id;
41
+
42
+ -- Get corresponding text search configuration
43
+ SELECT COALESCE(tsc.config_name, 'pg_catalog.simple') INTO v_config
44
+ FROM text_search_configs tsc
45
+ WHERE tsc.language_code = v_language;
46
+
47
+ -- Maintain full-text search data
48
+ INSERT INTO section_fts (section_id, document_id, language, fts_title, fts_content, fts_combined)
49
+ VALUES (
50
+ NEW.id,
51
+ NEW.document_id,
52
+ v_language,
53
+ setweight(to_tsvector(v_config::regconfig, COALESCE(NEW.section_title,'')), 'A'),
54
+ setweight(to_tsvector(v_config::regconfig, COALESCE(NEW.content,'')), 'B'),
55
+ setweight(to_tsvector(v_config::regconfig, COALESCE(NEW.section_title,'')), 'A') ||
56
+ setweight(to_tsvector(v_config::regconfig, COALESCE(NEW.content,'')), 'B')
57
+ )
58
+ ON CONFLICT (section_id) DO UPDATE SET
59
+ document_id = NEW.document_id,
60
+ language = v_language,
61
+ fts_title = setweight(to_tsvector(v_config::regconfig, COALESCE(NEW.section_title,'')), 'A'),
62
+ fts_content = setweight(to_tsvector(v_config::regconfig, COALESCE(NEW.content,'')), 'B'),
63
+ fts_combined = setweight(to_tsvector(v_config::regconfig, COALESCE(NEW.section_title,'')), 'A') ||
64
+ setweight(to_tsvector(v_config::regconfig, COALESCE(NEW.content,'')), 'B'),
65
+ updated_at = CURRENT_TIMESTAMP;
66
+
67
+ RETURN NEW;
68
+ END;
69
+ $$ LANGUAGE plpgsql;
70
+ SQL
71
+
72
+ # Create trigger on source_sections table
73
+ run <<-SQL
74
+ CREATE TRIGGER trigger_update_section_fts
75
+ AFTER INSERT OR UPDATE ON source_sections
76
+ FOR EACH ROW EXECUTE FUNCTION update_section_fts();
77
+ SQL
78
+
79
+ # Also create trigger for document language changes
80
+ run <<-SQL
81
+ CREATE OR REPLACE FUNCTION update_section_fts_on_doc_update()
82
+ RETURNS TRIGGER AS $$
83
+ BEGIN
84
+ -- Update FTS for all sections when document language changes
85
+ IF NEW.language IS DISTINCT FROM OLD.language THEN
86
+ UPDATE section_fts
87
+ SET updated_at = CURRENT_TIMESTAMP
88
+ WHERE document_id = NEW.id;
89
+ END IF;
90
+ RETURN NEW;
91
+ END;
92
+ $$ LANGUAGE plpgsql;
93
+ SQL
94
+
95
+ run <<-SQL
96
+ CREATE TRIGGER trigger_update_section_fts_on_doc
97
+ AFTER UPDATE OF language ON source_documents
98
+ FOR EACH ROW EXECUTE FUNCTION update_section_fts_on_doc_update();
99
+ SQL
100
+ end
101
+
102
+ down do
103
+ run 'DROP TRIGGER IF EXISTS trigger_update_section_fts_on_doc ON source_documents'
104
+ run 'DROP FUNCTION IF EXISTS update_section_fts_on_doc_update()'
105
+ run 'DROP TRIGGER IF EXISTS trigger_update_section_fts ON source_sections'
106
+ run 'DROP FUNCTION IF EXISTS update_section_fts()'
107
+ drop_table :section_fts
108
+ end
109
+ end
@@ -0,0 +1,28 @@
1
+ Sequel.migration do
2
+ up do
3
+ # Create vector extension if it doesn't exist
4
+ run 'CREATE EXTENSION IF NOT EXISTS vector'
5
+
6
+ create_table :embeddings do
7
+ primary_key :id
8
+ foreign_key :source_id, :source_sections, null: false, on_delete: :cascade
9
+ # Vector dimension size (adjust based on embedding model)
10
+ column :vector, 'vector(1024)', null: false
11
+ DateTime :created_at, default: Sequel::CURRENT_TIMESTAMP
12
+ end
13
+
14
+ # Create IVFFLAT index for approximate nearest neighbor search
15
+ # IVFFLAT provides good accuracy with fast search
16
+ run 'CREATE INDEX idx_embedding_vector ON embeddings USING ivfflat (vector vector_cosine_ops) WITH (lists = 100)'
17
+
18
+ # Index for source_id lookups (foreign key column)
19
+ add_index :embeddings, :source_id
20
+
21
+ # Additional index on created_at for time-based lookups (not used by the vector similarity search itself)
22
+ add_index :embeddings, :created_at
23
+ end
24
+
25
+ down do
26
+ drop_table :embeddings
27
+ end
28
+ end
@@ -0,0 +1,30 @@
1
+ Sequel.migration do
2
+ up do
3
+ create_table :search_logs do
4
+ primary_key :id
5
+ String :query, null: false, text: true
6
+ String :search_type, size: 20 # 'vector', 'fulltext', 'hybrid'
7
+ Integer :execution_time_ms
8
+ Integer :results_count
9
+ column :query_vector, 'vector(1024)' # Store query vector for analysis
10
+ column :result_ids, 'integer[]' # Store result IDs for relevance analysis
11
+ column :filters, 'jsonb' # Store search filters
12
+ DateTime :created_at, default: Sequel::CURRENT_TIMESTAMP
13
+ end
14
+
15
+ # Indexes for performance monitoring
16
+ add_index :search_logs, :created_at
17
+ add_index :search_logs, :search_type
18
+ add_index :search_logs, :execution_time_ms
19
+
20
+ # GIN index for full-text search on queries
21
+ run "CREATE INDEX search_logs_query_idx ON search_logs USING gin (to_tsvector('simple', query))"
22
+
23
+ # Composite index for common analytics queries
24
+ add_index :search_logs, [:search_type, :created_at]
25
+ end
26
+
27
+ down do
28
+ drop_table :search_logs
29
+ end
30
+ end
@@ -0,0 +1,10 @@
1
+ Sequel.migration do
2
+ up do
3
+ add_column :source_documents, :metadata, :jsonb, default: '{}'
4
+ add_index :source_documents, :metadata, type: :gin
5
+ end
6
+
7
+ down do
8
+ drop_column :source_documents, :metadata
9
+ end
10
+ end
@@ -0,0 +1,23 @@
1
+ Sequel.migration do
2
+ up do
3
+ add_column :source_documents, :source_type, String, size: 50, default: 'manual'
4
+ add_column :source_documents, :source_uri, String, text: true
5
+ add_column :source_documents, :content_hash, String, size: 128
6
+
7
+ add_index :source_documents, :source_type
8
+ add_index :source_documents, :source_uri
9
+ add_index :source_documents, :content_hash
10
+ add_index :source_documents, [:source_uri, :content_hash]
11
+ end
12
+
13
+ down do
14
+ drop_index :source_documents, [:source_uri, :content_hash]
15
+ drop_index :source_documents, :content_hash
16
+ drop_index :source_documents, :source_uri
17
+ drop_index :source_documents, :source_type
18
+
19
+ drop_column :source_documents, :content_hash
20
+ drop_column :source_documents, :source_uri
21
+ drop_column :source_documents, :source_type
22
+ end
23
+ end
@@ -0,0 +1,51 @@
1
+ -- Complete rebuild of section_fts fulltext indexes
2
+ -- This script rebuilds all fulltext search indexes with correct language and tokenizer
3
+
4
+ BEGIN;
5
+
6
+ -- 1. Delete all existing fts data
7
+ DELETE FROM section_fts;
8
+
9
+ -- 2. Insert all sections with basic info
10
+ INSERT INTO section_fts (section_id, document_id, language)
11
+ SELECT
12
+ ss.id,
13
+ ss.document_id,
14
+ COALESCE(sd.language, 'zh') as language
15
+ FROM source_sections ss
16
+ JOIN source_documents sd ON sd.id = ss.document_id;
17
+
18
+ -- 3. Trigger updates by touching all sections
19
+ -- This will cause the trigger to fire and rebuild fts vectors.
20
+ --
21
+ -- Every section must be touched, so this is one unbounded UPDATE.
22
+ -- Do not split it into a fixed set of LIMIT/OFFSET batches here:
23
+ --   * a fixed number of batches leaves all later rows with NULL vectors;
24
+ --   * LIMIT/OFFSET without ORDER BY has no stable row order, so batches
25
+ --     may repeat or skip rows;
26
+ --   * the whole script is one transaction, so row locks are held until
27
+ --     COMMIT regardless of how the statement is split.
28
+ -- For very large tables, batch by id range from the calling script,
29
+ -- committing each batch in its own transaction.
30
+ --
31
+ UPDATE source_sections SET updated_at = CURRENT_TIMESTAMP;
32
+
33
+ -- Verify rebuild
34
+ SELECT '=== Verification ===' as info;
35
+ SELECT COUNT(*) as total_sections FROM section_fts;
36
+ SELECT COUNT(*) as sections_with_fts_title FROM section_fts WHERE fts_title IS NOT NULL;
37
+ SELECT COUNT(*) as sections_with_fts_content FROM section_fts WHERE fts_content IS NOT NULL;
38
+ SELECT COUNT(*) as sections_with_fts_combined FROM section_fts WHERE fts_combined IS NOT NULL;
39
+
40
+ -- Test search
41
+ SELECT '=== Test Search for 老人情感 ===' as info;
42
+ SELECT section_id, section_title
43
+ FROM section_fts ssf
44
+ JOIN source_sections ss ON ss.id = ssf.section_id
45
+ WHERE ssf.fts_combined @@ plainto_tsquery('jiebacfg', '老人情感')
46
+ AND ssf.language = 'zh'
47
+ LIMIT 5;
48
+
49
+ COMMIT;
50
+
51
+ SELECT '=== Complete! All fulltext indexes rebuilt ===' as status;
@@ -0,0 +1,28 @@
1
+ -- Seed data for text_search_configs table
2
+ -- This file contains the standard language configurations for full-text search
3
+
4
+ -- Clear existing data
5
+ TRUNCATE TABLE text_search_configs;
6
+
7
+ -- Insert language configurations
8
+ INSERT INTO text_search_configs (language_code, config_name, is_installed) VALUES
9
+ ('en', 'pg_catalog.english', true),
10
+ ('en_us', 'pg_catalog.english', true),
11
+ ('en_gb', 'pg_catalog.english', true),
12
+ ('zh', 'jiebacfg', true),
13
+ ('zh_cn', 'jiebacfg', true),
14
+ ('zh_tw', 'jiebacfg', true),
15
+ ('ja', 'pg_catalog.simple', true),
16
+ ('ja_jp', 'pg_catalog.simple', true),
17
+ ('ko', 'pg_catalog.simple', true),
18
+ ('ko_kr', 'pg_catalog.simple', true),
19
+ ('es', 'pg_catalog.spanish', true),
20
+ ('fr', 'pg_catalog.french', true),
21
+ ('de', 'pg_catalog.german', true),
22
+ ('it', 'pg_catalog.italian', true),
23
+ ('ru', 'pg_catalog.russian', true),
24
+ ('ar', 'pg_catalog.simple', true),
25
+ ('default', 'pg_catalog.simple', true);
26
+
27
+ -- Add comment for jieba configuration
28
+ COMMENT ON COLUMN text_search_configs.config_name IS 'Full-text search configuration name. For Chinese support, ensure pg_jieba extension is installed';
@@ -0,0 +1,32 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require_relative "common"
5
+
6
+ include Examples::Common
7
+
8
+ print_header("Quick Start")
9
+ smart_rag = build_client
10
+
11
+ stats = smart_rag.statistics
12
+ puts "SmartRAG initialized."
13
+ puts "Documents in DB: #{stats[:document_count]}"
14
+
15
+ query = ARGV[0] || "machine learning algorithms"
16
+ results = smart_rag.search(
17
+ query,
18
+ search_type: "hybrid",
19
+ limit: 5,
20
+ include_content: true,
21
+ )
22
+
23
+ print_header("First Search: #{query}")
24
+ results.fetch(:results, []).each_with_index do |result, idx|
25
+ score = result[:combined_score] || result[:similarity] || 0.0
26
+ puts "#{idx + 1}. #{result[:section_title]} (score: #{score.round(3)})"
27
+ next unless result[:content]
28
+
29
+ preview = result[:content][0, 120].to_s.gsub(/\s+/, " ")
30
+ puts " #{preview}..."
31
+ end
32
+
@@ -0,0 +1,41 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require_relative "common"
5
+
6
+ include Examples::Common
7
+
8
+ print_header("Document Management")
9
+ smart_rag = build_client
10
+
11
+ document_path = ARGV[0]
12
+ if document_path.nil? || document_path.strip.empty?
13
+ warn "Usage: ruby examples/02_document_management.rb /path/to/document.md"
14
+ exit 1
15
+ end
16
+
17
+ add_result = smart_rag.add_document(
18
+ document_path,
19
+ title: File.basename(document_path),
20
+ generate_embeddings: true,
21
+ generate_tags: true,
22
+ tags: ["example", "usage_examples"],
23
+ metadata: { source: "examples/02_document_management.rb" },
24
+ )
25
+
26
+ print_json("Add Result", add_result)
27
+
28
+ document_id = add_result[:document_id]
29
+ detail = smart_rag.get_document(document_id)
30
+ print_json("Document Detail", detail || {})
31
+
32
+ list = smart_rag.list_documents(page: 1, per_page: 10, search: File.basename(document_path))
33
+ print_json("Document List", list)
34
+
35
+ # Pass DELETE=1 to demonstrate cleanup:
36
+ # DELETE=1 ruby examples/02_document_management.rb test/python_basics.md
37
+ if ENV["DELETE"] == "1"
38
+ delete_result = smart_rag.remove_document(document_id)
39
+ print_json("Delete Result", delete_result)
40
+ end
41
+
@@ -0,0 +1,46 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require_relative "common"
5
+
6
+ include Examples::Common
7
+
8
+ print_header("Search Operations")
9
+ smart_rag = build_client
10
+
11
+ query = ARGV[0] || "deep learning applications in healthcare"
12
+
13
+ hybrid = smart_rag.search(
14
+ query,
15
+ search_type: "hybrid",
16
+ limit: 5,
17
+ alpha: 0.7,
18
+ include_metadata: true,
19
+ )
20
+ print_json("Hybrid Search Metadata", hybrid[:metadata] || {})
21
+
22
+ vector = smart_rag.vector_search(
23
+ "neural network architectures",
24
+ limit: 5,
25
+ include_content: false,
26
+ )
27
+ print_json("Vector Search Metadata", vector[:metadata] || {})
28
+
29
+ fulltext = smart_rag.fulltext_search(
30
+ '"deep reinforcement learning"',
31
+ limit: 5,
32
+ )
33
+ print_json("Fulltext Search Metadata", fulltext[:metadata] || {})
34
+
35
+ multilingual_queries = [
36
+ ["zh_cn", "人工智能应用"],
37
+ ["ja", "機械学習アルゴリズム"],
38
+ ["ko", "딥러닝 모델"],
39
+ ["auto", "AI和机器学习的发展"],
40
+ ]
41
+
42
+ multilingual_queries.each do |language, q|
43
+ result = smart_rag.search(q, search_type: "hybrid", language: language, limit: 3)
44
+ puts "[#{language}] #{q} -> #{result.fetch(:results, []).length} results"
45
+ end
46
+
@@ -0,0 +1,38 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require_relative "common"
5
+
6
+ include Examples::Common
7
+
8
+ print_header("Topics And Tags")
9
+ smart_rag = build_client
10
+
11
+ topic = smart_rag.create_topic(
12
+ "AI in Healthcare",
13
+ "Applications of AI in medical diagnosis and treatment",
14
+ tags: ["ai", "healthcare", "diagnosis"],
15
+ document_ids: [],
16
+ )
17
+ print_json("Created Topic", topic)
18
+
19
+ topic_detail = smart_rag.get_topic(topic[:id])
20
+ print_json("Topic Detail", topic_detail || {})
21
+
22
+ all_topics = smart_rag.list_topics(page: 1, per_page: 20)
23
+ puts "Topic count on page: #{all_topics.fetch(:topics, []).length}"
24
+
25
+ recommendations = smart_rag.get_topic_recommendations(topic[:id], limit: 5)
26
+ print_json("Topic Recommendations", recommendations)
27
+
28
+ sample_text = <<~TEXT
29
+ Machine learning is a subset of artificial intelligence that enables systems
30
+ to learn and improve from experience without explicit programming.
31
+ TEXT
32
+
33
+ tags = smart_rag.generate_tags(sample_text, context: "AI introduction", max_tags: 5)
34
+ print_json("Generated Tags", tags)
35
+
36
+ tag_page = smart_rag.list_tags(page: 1, per_page: 20)
37
+ puts "Tag count on page: #{tag_page.fetch(:tags, []).length}"
38
+