smart_rag 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +33 -0
  3. data/README.en.md +115 -0
  4. data/README.md +144 -0
  5. data/config/database.yml +42 -0
  6. data/config/fulltext_search.yml +111 -0
  7. data/config/llm_config.yml +15 -0
  8. data/config/smart_rag.yml +156 -0
  9. data/db/fix_search_issues.sql +81 -0
  10. data/db/migrations/001_create_source_documents.rb +26 -0
  11. data/db/migrations/002_create_source_sections.rb +20 -0
  12. data/db/migrations/003_create_tags.rb +17 -0
  13. data/db/migrations/004_create_research_topics.rb +16 -0
  14. data/db/migrations/005_create_relationship_tables.rb +42 -0
  15. data/db/migrations/006_create_text_search_configs.rb +28 -0
  16. data/db/migrations/007_create_section_fts.rb +109 -0
  17. data/db/migrations/008_create_embeddings.rb +28 -0
  18. data/db/migrations/009_create_search_logs.rb +30 -0
  19. data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
  20. data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
  21. data/db/rebuild_fts_complete.sql +51 -0
  22. data/db/seeds/text_search_configs.sql +28 -0
  23. data/examples/01_quick_start.rb +32 -0
  24. data/examples/02_document_management.rb +41 -0
  25. data/examples/03_search_operations.rb +46 -0
  26. data/examples/04_topics_and_tags.rb +38 -0
  27. data/examples/05_advanced_patterns.rb +154 -0
  28. data/examples/06_error_handling_and_retry.rb +64 -0
  29. data/examples/README.md +42 -0
  30. data/examples/common.rb +57 -0
  31. data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
  32. data/lib/smart_rag/config.rb +126 -0
  33. data/lib/smart_rag/core/document_processor.rb +537 -0
  34. data/lib/smart_rag/core/embedding.rb +340 -0
  35. data/lib/smart_rag/core/fulltext_manager.rb +483 -0
  36. data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
  37. data/lib/smart_rag/core/query_processor.rb +577 -0
  38. data/lib/smart_rag/errors.rb +88 -0
  39. data/lib/smart_rag/models/embedding.rb +140 -0
  40. data/lib/smart_rag/models/model_base.rb +106 -0
  41. data/lib/smart_rag/models/research_topic.rb +171 -0
  42. data/lib/smart_rag/models/research_topic_section.rb +86 -0
  43. data/lib/smart_rag/models/research_topic_tag.rb +89 -0
  44. data/lib/smart_rag/models/search_log.rb +198 -0
  45. data/lib/smart_rag/models/section_fts.rb +170 -0
  46. data/lib/smart_rag/models/section_tag.rb +81 -0
  47. data/lib/smart_rag/models/source_document.rb +204 -0
  48. data/lib/smart_rag/models/source_section.rb +201 -0
  49. data/lib/smart_rag/models/tag.rb +214 -0
  50. data/lib/smart_rag/models/text_search_config.rb +168 -0
  51. data/lib/smart_rag/models.rb +116 -0
  52. data/lib/smart_rag/parsers/query_parser.rb +291 -0
  53. data/lib/smart_rag/retrieve.rb +745 -0
  54. data/lib/smart_rag/services/embedding_service.rb +278 -0
  55. data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
  56. data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
  57. data/lib/smart_rag/services/summarization_service.rb +322 -0
  58. data/lib/smart_rag/services/tag_service.rb +614 -0
  59. data/lib/smart_rag/services/vector_search_service.rb +347 -0
  60. data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
  61. data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
  62. data/lib/smart_rag/smart_chunking/merger.rb +94 -0
  63. data/lib/smart_rag/smart_chunking/parser.rb +75 -0
  64. data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
  65. data/lib/smart_rag/smart_chunking/section.rb +11 -0
  66. data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
  67. data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
  68. data/lib/smart_rag/version.rb +3 -0
  69. data/lib/smart_rag.rb +986 -0
  70. data/workers/analyze_content.rb +6 -0
  71. data/workers/get_embedding.rb +7 -0
  72. metadata +311 -0
@@ -0,0 +1,140 @@
1
+ require_relative 'model_base'
2
+ require 'sequel/plugins/validation_helpers'
3
+
4
+ module SmartRAG
5
+ module Models
6
+ # Embedding model for storing vector embeddings of document sections
7
+ class Embedding < Sequel::Model
8
+ include FactoryBotHelpers
9
+ plugin :validation_helpers
10
+ plugin :timestamps
11
+
12
+ # Set dataset after database is connected
13
# Points this model at the +embeddings+ table. Meant to be invoked after the
# database connection has been established.
def self.set_dataset_from_db
  table_name = :embeddings
  set_dataset(table_name)
end
16
+
17
+ # Add bang methods for FactoryBot compatibility
18
# Builds a record from +attributes+, persists it via +save!+ and returns it.
#
# +save!+ itself raises Sequel::ValidationFailed when the save fails, so its
# return value does not need a second check here.
#
# @param attributes [Hash] column values for the new record
# @return [Object] the saved instance
def self.create!(attributes = {})
  instance = new(attributes)
  instance.save!
  instance
end
23
+
24
+ # Relationships
25
+ many_to_one :section, class: '::SmartRAG::Models::SourceSection', key: :source_id
26
+
27
+ # Validation
28
# Runs the inherited validations, then requires both +source_id+ and
# +vector+ to be present.
def validate
  super
  required_columns = %i[source_id vector]
  validates_presence required_columns
end
32
+
33
+ # Class methods
34
+ class << self
35
+ # Find embeddings by source section
36
# Loads every embedding whose +source_id+ equals +section_id+.
def by_section(section_id)
  matching = where(source_id: section_id)
  matching.all
end
39
+
40
+ # Find embeddings by multiple sections
41
# Loads every embedding whose +source_id+ matches +section_ids+ (passed
# straight through as the filter value).
def by_sections(section_ids)
  matching = where(source_id: section_ids)
  matching.all
end
44
+
45
+ # Find similar embeddings using cosine distance (pgvector)
46
# Finds embeddings close to +query_vector+ by cosine distance, nearest first.
#
# @param query_vector [Array, #to_s] query embedding; an Array is rendered as
#   a "[a,b,c]" literal, anything else goes through +to_s+
# @param limit [Integer] maximum number of rows returned
# @param threshold [Numeric] similarity threshold; rows are kept when their
#   cosine distance is below +1 - threshold+
# @return [Array] result of +all+ on the filtered, ordered dataset
def similar_to(query_vector, limit: 10, threshold: 0.3)
  use_operator = db.server_version >= 120_000 # PostgreSQL 12+
  vector_literal = query_vector.is_a?(Array) ? "[#{query_vector.join(',')}]" : query_vector.to_s
  max_distance = 1 - threshold

  if use_operator
    filter = Sequel.lit('(vector <=> ?) < ?', vector_literal, max_distance)
    ranking = Sequel.lit('vector <=> ?', vector_literal)
  else
    # Older servers: use the function form instead of the <=> operator.
    filter = Sequel.lit('cosine_distance(vector, ?) < ?', vector_literal, max_distance)
    ranking = Sequel.lit('cosine_distance(vector, ?)', vector_literal)
  end

  where(filter).order(ranking).limit(limit).all
end
70
+
71
+ # Find nearest embeddings without a threshold fallback
72
# Returns the +limit+ embeddings nearest to +query_vector+ by cosine
# distance, with no distance cut-off applied.
#
# @param query_vector [Array, #to_s] query embedding; an Array is rendered as
#   a "[a,b,c]" literal, anything else goes through +to_s+
# @param limit [Integer] maximum number of rows returned
# @return [Array] result of +all+ on the ordered dataset
def nearest_to(query_vector, limit: 10)
  use_operator = db.server_version >= 120_000 # PostgreSQL 12+
  vector_literal = query_vector.is_a?(Array) ? "[#{query_vector.join(',')}]" : query_vector.to_s

  ranking =
    if use_operator
      Sequel.lit('vector <=> ?', vector_literal)
    else
      Sequel.lit('cosine_distance(vector, ?)', vector_literal)
    end

  order(ranking).limit(limit).all
end
91
+
92
+ # Batch insert embeddings
93
# Inserts all rows in +embedding_data+ with a single +multi_insert+ call
# wrapped in a database transaction.
def batch_insert(embedding_data)
  db.transaction { dataset.multi_insert(embedding_data) }
end
98
+
99
+ # Delete embeddings by section
100
# Deletes every embedding whose +source_id+ equals +section_id+ and returns
# whatever the dataset's +delete+ returns.
def delete_by_section(section_id)
  doomed = where(source_id: section_id)
  doomed.delete
end
103
+
104
+ # Delete old embeddings (cleanup)
105
# Deletes embeddings whose +created_at+ is earlier than +days+ days before
# the current time.
def delete_old_embeddings(days: 30)
  seconds_per_day = 24 * 60 * 60
  cutoff = Time.now - (days * seconds_per_day)
  where(Sequel.lit('created_at < ?', cutoff)).delete
end
108
+ end
109
+
110
+ # Instance methods
111
+
112
+ # Return vector as array of floats
113
# Returns the stored vector as an Array of Floats, or nil when no vector is
# set.
#
# The textual vector form is bracketed ("[1,2,3]"); the brackets must be
# stripped before splitting, otherwise the first component ("[1") parses as
# 0.0. Angle brackets are still stripped as before. A value that is already
# an Array is converted element-wise without a string round trip.
#
# @return [Array<Float>, nil]
def vector_array
  return nil unless vector
  return vector.map(&:to_f) if vector.is_a?(Array)

  vector.to_s.delete('[]<>').split(',').map(&:to_f)
end
119
+
120
+ # Calculate similarity to another vector
121
# Cosine similarity between this record's vector (as returned by
# +vector_array+) and +other_vector+.
def similarity_to(other_vector)
  vector_array_cosine_similarity(vector_array, other_vector)
end
125
+
126
+ private
127
+
128
# Cosine similarity of two numeric arrays.
#
# @param v1 [Array<Numeric>, nil]
# @param v2 [Array<Numeric>, nil]
# @return [Float] 0.0 when either input is nil/empty or has zero magnitude
# @raise [ArgumentError] when the vectors have different lengths; zipping
#   them would otherwise either truncate the dot product silently or fail
#   with an opaque TypeError on a nil component
def vector_array_cosine_similarity(v1, v2)
  return 0.0 if v1.nil? || v2.nil? || v1.empty? || v2.empty?

  unless v1.size == v2.size
    raise ArgumentError, "vector dimensions differ (#{v1.size} vs #{v2.size})"
  end

  dot_product = v1.zip(v2).sum { |a, b| a * b }
  magnitude1 = Math.sqrt(v1.sum { |x| x * x })
  magnitude2 = Math.sqrt(v2.sum { |x| x * x })
  return 0.0 if magnitude1.zero? || magnitude2.zero?

  dot_product / (magnitude1 * magnitude2)
end
138
+ end
139
+ end
140
+ end
@@ -0,0 +1,106 @@
1
+ require 'sequel'
2
+
3
+ module SmartRAG
4
+ module Models
5
+ # Module to add FactoryBot compatibility methods to Sequel models
6
# Mixin giving Sequel models ActiveRecord/FactoryBot-style helpers.
module FactoryBotHelpers
  # Saves the record, raising instead of returning a falsy value.
  #
  # @raise [Sequel::ValidationFailed] when +save+ returns a falsy value; the
  #   message is the joined list of validation errors
  def save!(*args)
    save(*args) || raise(Sequel::ValidationFailed, errors.full_messages.join(', '))
  end

  # Defines a +create!+ class method on every including class.
  def self.included(base)
    base.class_eval do
      # Builds, saves and returns a record. +save!+ already raises on
      # failure, so its return value is not checked a second time.
      def self.create!(attributes = {})
        instance = new(attributes)
        instance.save!
        instance
      end
    end
  end

  # The record's +values+ hash.
  def to_hash
    values
  end

  # JSON encoding of +to_hash+.
  def to_json(*args)
    to_hash.to_json(*args)
  end
end
31
+
32
+ # Stub database class that allows models to be loaded without connection
33
# Placeholder database object so model classes can be defined before a real
# connection exists. Unknown calls are answered with a StubDataset.
class DelayedConnection
  # Any call without an explicit definition below yields a fresh stub dataset.
  def method_missing(_method_name, *_args, &_block)
    StubDataset.new(self)
  end

  # Claims to respond to every message.
  def respond_to_missing?(_method_name, _include_private = false)
    true
  end

  # Reports itself as a Sequel::Database in addition to its real ancestry.
  def kind_of?(other)
    return true if other == Sequel::Database

    super
  end

  # Explicit stubs for calls made while a model class is being defined.

  # Always an empty schema.
  def schema(*_args)
    []
  end

  # Always an empty table list.
  def tables(*_args)
    []
  end

  # Runs the block directly; nothing is opened or committed.
  def transaction(*_args)
    yield
  end

  def class_scope(*_args)
    self
  end

  def from(*_args)
    StubDataset.new(self)
  end
end
70
+
71
+ # Stub dataset class for delayed connection
72
# Chainable null dataset handed out by the delayed connection: every unknown
# call, and +clone+, returns the same object.
class StubDataset
  def initialize(db)
    @db = db
  end

  # Swallows any call and returns self so call chains keep working.
  def method_missing(_method_name, *_args, **_kwargs, &_block)
    self
  end

  # Claims to respond to every message.
  def respond_to_missing?(_method_name, _include_private = false)
    true
  end

  # Returns self rather than a copy.
  def clone(*_args, **_kwargs)
    self
  end
end
90
+
91
+ # Set a stub database connection so Sequel::Model subclasses can be defined
92
+ # This will be replaced with a real connection when SmartRAG::Models.db= is called
93
# Probe for an existing Sequel::Model database; accessing it raises
# Sequel::Error when none has been configured yet.
stub_needed =
  begin
    Sequel::Model.db
    false
  rescue Sequel::Error
    true
  end

# No database set yet: install the stub connection.
Sequel::Model.db = DelayedConnection.new if stub_needed
99
+
100
+ # Base class placeholder - models inherit directly from Sequel::Model
101
+ # This class is just for organization and documentation
102
+ module ModelBase
103
+ end
104
+ end
105
+ end
106
+
@@ -0,0 +1,171 @@
1
+ require_relative "model_base"
2
+ require "sequel/plugins/validation_helpers"
3
+
4
+ module SmartRAG
5
+ module Models
6
+ # ResearchTopic model for organizing content by topics
7
+ class ResearchTopic < Sequel::Model
8
+ # Set dataset after database is connected
9
# Points this model at the +research_topics+ table. Meant to be invoked
# after the database connection has been established.
def self.set_dataset_from_db
  table_name = :research_topics
  set_dataset(table_name)
end
12
+ include FactoryBotHelpers
13
+ plugin :validation_helpers
14
+ plugin :timestamps, update_on_create: true
15
+
16
+ # Add bang methods for FactoryBot compatibility
17
# Builds a topic from +attributes+, persists it via +save!+ and returns it.
#
# +save!+ itself raises Sequel::ValidationFailed when the save fails, so its
# return value does not need a second check here.
#
# @param attributes [Hash] column values for the new record
# @return [Object] the saved instance
def self.create!(attributes = {})
  instance = new(attributes)
  instance.save!
  instance
end
22
+
23
+ # Relationships
24
+ one_to_many :research_topic_sections, class: '::SmartRAG::Models::ResearchTopicSection', key: :research_topic_id
25
+ one_to_many :research_topic_tags, class: '::SmartRAG::Models::ResearchTopicTag', key: :research_topic_id
26
+ many_to_many :sections, class: '::SmartRAG::Models::SourceSection',
27
+ join_table: :research_topic_sections,
28
+ left_key: :research_topic_id,
29
+ right_key: :section_id
30
+ many_to_many :tags, class: '::SmartRAG::Models::Tag',
31
+ join_table: :research_topic_tags,
32
+ left_key: :research_topic_id,
33
+ right_key: :tag_id
34
+
35
+ # Validation
36
# Runs the inherited validations, then checks that +name+ is present and at
# most 500 characters, and applies a presence check to +description+ with
# +allow_nil+ enabled.
def validate
  super
  max_name_length = 500

  validates_presence :name
  validates_max_length max_name_length, :name
  validates_presence :description, allow_nil: true
end
42
+
43
+ # Class methods
44
+ class << self
45
+ # Find topic by name
46
# Case-insensitive exact lookup of a topic by name.
#
# The name is escaped with the dataset's +escape_like+ so that "%", "_" and
# "\" in it are matched literally instead of acting as ILIKE wildcards.
#
# @param name [String, nil]
# @return [Object, nil] first matching topic; nil for a nil name
def find_by_name(name)
  return nil if name.nil?

  where(Sequel.ilike(:name, dataset.escape_like(name.to_s))).first
end
49
+
50
+ # Search topics by name or description
51
# Case-insensitive substring search over +name+ and +description+.
#
# The query text is escaped with the dataset's +escape_like+ before being
# wrapped in "%...%", so wildcard characters typed by the user are matched
# literally.
#
# @param query [#to_s] text to look for
# @return [Object] dataset of topics matching either column
def search(query)
  pattern = "%#{dataset.escape_like(query.to_s)}%"

  where(Sequel.ilike(:name, pattern))
    .or(Sequel.ilike(:description, pattern))
end
55
+
56
+ # Get topics with section count
57
# Topics together with the number of linked sections (+section_count+),
# most-linked first. Topics without sections are kept via the left join.
#
# Columns are qualified with Sequel[table][column] rather than the
# double-underscore symbol shorthand (:table__column), which Sequel 5 does
# not split by default and would treat as a single column name.
#
# @return [Object] dataset selecting research_topics.* plus section_count
def with_section_count
  section_total = Sequel.function(:count, Sequel[:research_topic_sections][:section_id])
                        .as(:section_count)

  select(Sequel[:research_topics].*)
    .select_append(section_total)
    .left_join(:research_topic_sections, research_topic_id: :id)
    .group(Sequel[:research_topics][:id])
    .order(Sequel.desc(:section_count))
end
63
+
64
+ # Get topics by tag
65
# Topics linked to +tag_id+ through the +research_topic_tags+ table, using a
# subselect of the matching topic ids.
def by_tag(tag_id)
  tagged_topic_ids = db[:research_topic_tags]
                     .select(:research_topic_id)
                     .where(tag_id: tag_id)
  where(id: tagged_topic_ids)
end
68
+
69
+ # Get recently used topics
70
# Newest topics first (by +created_at+), capped at +limit+ rows.
def recent(limit: 10)
  newest_first = Sequel.desc(:created_at)
  order(newest_first).limit(limit)
end
73
+
74
+ # Batch create topics
75
# Creates one topic per entry of +topics+ inside a single transaction and
# returns the created records in input order.
def batch_create(topics)
  db.transaction do
    topics.map do |topic_data|
      create(topic_data)
    end
  end
end
80
+ end
81
+
82
+ # Instance methods
83
+
84
+ # Add section to topic
85
# Links +section+ to this topic unless it is already linked.
#
# Delegates to the association-generated +add_section+ through +super+;
# calling +self.add_section+ here would re-enter this override and recurse
# without end.
#
# @return [Object, nil] result of the association method, or nil when the
#   section was already linked
def add_section(section)
  return if sections.include?(section)

  super
end
90
+
91
+ # Remove section from topic
92
# Unlinks +section+ from this topic when it is currently linked.
#
# Delegates to the association-generated +remove_section+ through +super+;
# calling +self.remove_section+ here would re-enter this override and
# recurse without end.
#
# @return [Object, nil] result of the association method, or nil when the
#   section was not linked
def remove_section(section)
  return unless sections.include?(section)

  super
end
97
+
98
+ # Add tag to topic
99
# Attaches +tag+ to this topic unless it is already attached.
#
# Delegates to the association-generated +add_tag+ through +super+; calling
# +self.add_tag+ here would re-enter this override and recurse without end.
#
# @return [Object, nil] result of the association method, or nil when the
#   tag was already attached
def add_tag(tag)
  return if tags.include?(tag)

  super
end
104
+
105
+ # Remove tag from topic
106
# Detaches +tag+ from this topic when it is currently attached.
#
# Delegates to the association-generated +remove_tag+ through +super+;
# calling +self.remove_tag+ here would re-enter this override and recurse
# without end.
#
# @return [Object, nil] result of the association method, or nil when the
#   tag was not attached
def remove_tag(tag)
  return unless tags.include?(tag)

  super
end
111
+
112
+ # Count sections for this topic
113
# Number of sections linked to this topic.
def section_count
  linked_sections = sections
  linked_sections.count
end

# Number of tags attached to this topic.
def tag_count
  attached_tags = tags
  attached_tags.count
end
121
+
122
+ # Get related topics (share sections or tags)
123
# Other topics that share at least one section with this topic. The +limit+
# topic ids with the most shared sections are selected in a subquery.
#
# Uses Sequel's +exclude+ (the original +where.not+ chain is ActiveRecord
# syntax and fails on a Sequel dataset) and +Sequel.function(:count).*+ so
# the aggregate renders as count(*) rather than counting an identifier "*".
#
# @param limit [Integer] maximum number of related topics
# @return [Array] related topic records; empty when this topic has no sections
def related_topics(limit: 5)
  section_ids = sections.map(&:id)
  return [] if section_ids.empty?

  topic_ids = db[:research_topic_sections]
              .select(:research_topic_id)
              .where(section_id: section_ids)
              .exclude(research_topic_id: id)
              .group(:research_topic_id)
              .order(Sequel.desc(Sequel.function(:count).*))
              .limit(limit)

  self.class.where(id: topic_ids).all
end
134
+
135
+ # Get topics info
136
# Summary hash for this topic: id, name, section and tag counts, and the
# creation timestamp.
def info
  summary = { id: id, name: name }
  summary[:section_count] = section_count
  summary[:tag_count] = tag_count
  summary[:created_at] = created_at
  summary
end
145
+
146
+ # String representation
147
# Human-readable one-line description including id, name and the section
# and tag counts.
def to_s
  topic_id = id
  topic_name = name
  "<ResearchTopic: #{topic_id} - #{topic_name} (#{section_count} sections, #{tag_count} tags)>"
end
150
+
151
+ # Alias name as title for API compatibility
152
# Reader exposing +name+ under the +title+ alias for API compatibility.
def title
  self.name
end

# Writer counterpart of +title+: assigns +value+ to +name+.
def title=(value)
  self.name = value
end
159
+
160
+ # Alias created_at as updated_at for API compatibility
161
# Returns the in-memory +updated_at+ value when one has been assigned,
# otherwise falls back to +created_at+.
def updated_at
  return @updated_at if @updated_at

  created_at
end

# Stores +value+ in an instance variable only; nothing is written to a
# database column by this setter.
def updated_at=(value)
  @updated_at = value
end
169
+ end
170
+ end
171
+ end
@@ -0,0 +1,86 @@
1
+ require_relative "model_base"
2
+ require "sequel/plugins/validation_helpers"
3
+
4
+ module SmartRAG
5
+ module Models
6
+ # ResearchTopicSection model for many-to-many relationship
7
class ResearchTopicSection < Sequel::Model(:research_topic_sections)
  include FactoryBotHelpers
  plugin :validation_helpers
  plugin :timestamps, update_on_create: false

  # Allow mass assignment of primary keys
  unrestrict_primary_key

  # Builds, saves and returns a link record. +save!+ raises on failure, so
  # its return value is not checked again.
  def self.create!(attributes = {})
    instance = new(attributes)
    instance.save!
    instance
  end

  # Relationships
  many_to_one :research_topic, class: '::SmartRAG::Models::ResearchTopic', key: :research_topic_id
  many_to_one :section, class: '::SmartRAG::Models::SourceSection', key: :section_id

  # Both keys must be present and the (topic, section) pair must be unique.
  def validate
    super
    validates_presence [:research_topic_id, :section_id]
    validates_unique [:research_topic_id, :section_id]
  end

  class << self
    # First link row for the given topic/section pair, if any.
    def find_by_topic_and_section(topic_id, section_id)
      where(research_topic_id: topic_id, section_id: section_id).first
    end

    # All link rows belonging to a topic.
    def sections_for_topic(topic_id)
      where(research_topic_id: topic_id).all
    end

    # All link rows belonging to a section.
    def topics_for_section(section_id)
      where(section_id: section_id).all
    end

    # Deletes every link row of a topic.
    def delete_all_for_topic(topic_id)
      where(research_topic_id: topic_id).delete
    end

    # Deletes every link row of a section.
    def delete_all_for_section(section_id)
      where(section_id: section_id).delete
    end

    # Inserts all +associations+ rows with one +multi_insert+ inside a
    # transaction.
    def bulk_create(associations)
      db.transaction { dataset.multi_insert(associations) }
    end

    # True when a link row exists for the pair. Uses the dataset's +empty?+
    # existence check instead of counting every matching row.
    def in_topic?(topic_id, section_id)
      !where(research_topic_id: topic_id, section_id: section_id).empty?
    end

    # Most recently created link rows first, capped at +limit+.
    def recent(limit: 50)
      order(Sequel.desc(:created_at)).limit(limit)
    end
  end

  # String representation
  def to_s
    "<ResearchTopicSection: topic:#{research_topic_id} => section:#{section_id}>"
  end
end
85
+ end
86
+ end
@@ -0,0 +1,89 @@
1
+ require_relative "model_base"
2
+ require "sequel/plugins/validation_helpers"
3
+
4
+ module SmartRAG
5
+ module Models
6
+ # ResearchTopicTag model for many-to-many relationship between topics and tags
7
class ResearchTopicTag < Sequel::Model(:research_topic_tags)
  include FactoryBotHelpers
  plugin :validation_helpers
  plugin :timestamps, update_on_create: false
  # Allow mass assignment of composite primary key
  unrestrict_primary_key

  # Builds, saves and returns a link record. +save!+ raises on failure, so
  # its return value is not checked again.
  def self.create!(attributes = {})
    instance = new(attributes)
    instance.save!
    instance
  end

  # Relationships
  many_to_one :research_topic, class: '::SmartRAG::Models::ResearchTopic', key: :research_topic_id
  many_to_one :tag, class: '::SmartRAG::Models::Tag', key: :tag_id

  # Both keys must be present and the (topic, tag) pair must be unique.
  def validate
    super
    validates_presence [:research_topic_id, :tag_id]
    validates_unique [:research_topic_id, :tag_id]
  end

  class << self
    # First link row for the given topic/tag pair, if any.
    def find_by_topic_and_tag(topic_id, tag_id)
      where(research_topic_id: topic_id, tag_id: tag_id).first
    end

    # All link rows belonging to a topic.
    def tags_for_topic(topic_id)
      where(research_topic_id: topic_id).all
    end

    # All link rows belonging to a tag.
    def topics_for_tag(tag_id)
      where(tag_id: tag_id).all
    end

    # Deletes every link row of a topic.
    def delete_all_for_topic(topic_id)
      where(research_topic_id: topic_id).delete
    end

    # Deletes every link row of a tag.
    def delete_all_for_tag(tag_id)
      where(tag_id: tag_id).delete
    end

    # Inserts all +associations+ rows with one +multi_insert+ inside a
    # transaction.
    def bulk_create(associations)
      db.transaction { dataset.multi_insert(associations) }
    end

    # True when the topic carries the tag. Uses the dataset's +empty?+
    # existence check instead of counting every matching row.
    def has_tag?(topic_id, tag_id)
      !where(research_topic_id: topic_id, tag_id: tag_id).empty?
    end

    # Tag ids ranked by how many topics use them (+topic_count+), capped at
    # +limit+. Returns a dataset over the raw join table.
    def popular_tags(limit: 20)
      db[:research_topic_tags]
        .select(:tag_id, Sequel.function(:count, :research_topic_id).as(:topic_count))
        .group(:tag_id)
        .order(Sequel.desc(:topic_count))
        .limit(limit)
    end
  end

  # String representation
  def to_s
    "<ResearchTopicTag: topic:#{research_topic_id} => tag:#{tag_id}>"
  end
end
88
+ end
89
+ end