smart_rag 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +33 -0
- data/README.en.md +115 -0
- data/README.md +144 -0
- data/config/database.yml +42 -0
- data/config/fulltext_search.yml +111 -0
- data/config/llm_config.yml +15 -0
- data/config/smart_rag.yml +156 -0
- data/db/fix_search_issues.sql +81 -0
- data/db/migrations/001_create_source_documents.rb +26 -0
- data/db/migrations/002_create_source_sections.rb +20 -0
- data/db/migrations/003_create_tags.rb +17 -0
- data/db/migrations/004_create_research_topics.rb +16 -0
- data/db/migrations/005_create_relationship_tables.rb +42 -0
- data/db/migrations/006_create_text_search_configs.rb +28 -0
- data/db/migrations/007_create_section_fts.rb +109 -0
- data/db/migrations/008_create_embeddings.rb +28 -0
- data/db/migrations/009_create_search_logs.rb +30 -0
- data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
- data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
- data/db/rebuild_fts_complete.sql +51 -0
- data/db/seeds/text_search_configs.sql +28 -0
- data/examples/01_quick_start.rb +32 -0
- data/examples/02_document_management.rb +41 -0
- data/examples/03_search_operations.rb +46 -0
- data/examples/04_topics_and_tags.rb +38 -0
- data/examples/05_advanced_patterns.rb +154 -0
- data/examples/06_error_handling_and_retry.rb +64 -0
- data/examples/README.md +42 -0
- data/examples/common.rb +57 -0
- data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
- data/lib/smart_rag/config.rb +126 -0
- data/lib/smart_rag/core/document_processor.rb +537 -0
- data/lib/smart_rag/core/embedding.rb +340 -0
- data/lib/smart_rag/core/fulltext_manager.rb +483 -0
- data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
- data/lib/smart_rag/core/query_processor.rb +577 -0
- data/lib/smart_rag/errors.rb +88 -0
- data/lib/smart_rag/models/embedding.rb +140 -0
- data/lib/smart_rag/models/model_base.rb +106 -0
- data/lib/smart_rag/models/research_topic.rb +171 -0
- data/lib/smart_rag/models/research_topic_section.rb +86 -0
- data/lib/smart_rag/models/research_topic_tag.rb +89 -0
- data/lib/smart_rag/models/search_log.rb +198 -0
- data/lib/smart_rag/models/section_fts.rb +170 -0
- data/lib/smart_rag/models/section_tag.rb +81 -0
- data/lib/smart_rag/models/source_document.rb +204 -0
- data/lib/smart_rag/models/source_section.rb +201 -0
- data/lib/smart_rag/models/tag.rb +214 -0
- data/lib/smart_rag/models/text_search_config.rb +168 -0
- data/lib/smart_rag/models.rb +116 -0
- data/lib/smart_rag/parsers/query_parser.rb +291 -0
- data/lib/smart_rag/retrieve.rb +745 -0
- data/lib/smart_rag/services/embedding_service.rb +278 -0
- data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
- data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
- data/lib/smart_rag/services/summarization_service.rb +322 -0
- data/lib/smart_rag/services/tag_service.rb +614 -0
- data/lib/smart_rag/services/vector_search_service.rb +347 -0
- data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
- data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
- data/lib/smart_rag/smart_chunking/merger.rb +94 -0
- data/lib/smart_rag/smart_chunking/parser.rb +75 -0
- data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
- data/lib/smart_rag/smart_chunking/section.rb +11 -0
- data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
- data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
- data/lib/smart_rag/version.rb +3 -0
- data/lib/smart_rag.rb +986 -0
- data/workers/analyze_content.rb +6 -0
- data/workers/get_embedding.rb +7 -0
- metadata +311 -0
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
require_relative "model_base"
|
|
2
|
+
require "sequel/plugins/validation_helpers"
|
|
3
|
+
|
|
4
|
+
module SmartRAG
|
|
5
|
+
module Models
|
|
6
|
+
# SearchLog model for tracking search queries and performance
|
|
7
|
+
class SearchLog < Sequel::Model
|
|
8
|
+
# Points this model at the search_logs table. The class is declared without a
# dataset, so this is meant to be invoked once a database connection exists.
def self.set_dataset_from_db
  table_name = :search_logs
  set_dataset(table_name)
end
|
|
12
|
+
include FactoryBotHelpers
|
|
13
|
+
plugin :validation_helpers
|
|
14
|
+
plugin :timestamps, update_on_create: false
|
|
15
|
+
|
|
16
|
+
# Bang-style constructor kept for FactoryBot compatibility.
# Builds a record from +attributes+, persists it with save!, and raises
# Sequel::ValidationFailed (with the joined validation messages) when save!
# returns a falsy value. Returns the saved record.
def self.create!(attributes = {})
  record = new(attributes)
  unless record.save!
    raise(Sequel::ValidationFailed, record.errors.full_messages.join(", "))
  end
  record
end
|
|
22
|
+
|
|
23
|
+
# Validation rules for a search log entry:
# - query must be present and at most 10,000 characters long
# - search_type, when set, must be one of vector / fulltext / hybrid
# - execution_time_ms and results_count, when set, must be non-negative integers
def validate
  super
  validates_presence :query
  validates_max_length 10000, :query # Reasonable limit for query length
  validates_includes %w[vector fulltext hybrid], :search_type, allow_nil: true

  %i[execution_time_ms results_count].each do |column|
    validates_integer column, allow_nil: true
    # validates_integer has no lower-bound option, so the previous
    # `greater_than_or_equal_to: 0` key was silently ignored. The
    # non-negative rule is enforced explicitly instead.
    value = self[column]
    if value.is_a?(Numeric) && value.negative?
      errors.add(column, 'must be greater than or equal to 0')
    end
  end
end
|
|
32
|
+
|
|
33
|
+
# Class methods
|
|
34
|
+
class << self
|
|
35
|
+
# Records one search as a new row via create and returns whatever create returns.
# A Hash passed as +filters+ is serialized with to_json; any other value is
# stored unchanged.
def log(query:, search_type: nil, execution_time_ms: nil, results_count: nil,
        query_vector: nil, result_ids: nil, filters: nil)
  stored_filters = filters
  stored_filters = filters.to_json if filters.is_a?(Hash)

  attributes = {
    query: query,
    search_type: search_type,
    execution_time_ms: execution_time_ms,
    results_count: results_count,
    query_vector: query_vector,
    result_ids: result_ids,
    filters: stored_filters
  }
  create(attributes)
end
|
|
48
|
+
|
|
49
|
+
# Scope: log entries whose search_type equals +type+.
def by_search_type(type)
  criteria = { search_type: type }
  where(criteria)
end
|
|
53
|
+
|
|
54
|
+
# Scope: entries with a recorded execution time strictly above +threshold_ms+,
# slowest first.
def slow_queries(threshold_ms: 100)
  over_threshold = Sequel.lit('execution_time_ms IS NOT NULL AND execution_time_ms > ?', threshold_ms)
  slowest_first = Sequel.desc(:execution_time_ms)
  where(over_threshold).order(slowest_first)
end
|
|
59
|
+
|
|
60
|
+
# Scope: the +limit+ most recently created entries, newest first.
def recent(limit: 50)
  newest_first = Sequel.desc(:created_at)
  order(newest_first).limit(limit)
end
|
|
64
|
+
|
|
65
|
+
# Scope: distinct query strings with their occurrence count, most frequent
# first, capped at +limit+ rows.
def popular(limit: 20)
  counted = group_and_count(:query)
  counted.order(Sequel.desc(:count)).limit(limit)
end
|
|
71
|
+
|
|
72
|
+
# Scope: entries whose results_count is either 0 or NULL.
def with_no_results
  zero_hits = where(results_count: 0)
  zero_hits.or(results_count: nil)
end
|
|
76
|
+
|
|
77
|
+
# Scope: entries whose results_count is strictly greater than +threshold+.
def with_many_results(threshold: 100)
  above_threshold = Sequel.lit('results_count > ?', threshold)
  where(above_threshold)
end
|
|
81
|
+
|
|
82
|
+
# Scope: one row per search_type with avg_time (average execution_time_ms) and
# count, computed over entries that have an execution time, slowest type first.
def average_execution_time_by_type
  avg_time = Sequel.function(:avg, :execution_time_ms).as(:avg_time)
  row_count = Sequel.function(:count, :*).as(:count)

  select(:search_type, avg_time, row_count)
    .where(Sequel.lit('execution_time_ms IS NOT NULL'))
    .group(:search_type)
    .order(Sequel.desc(:avg_time))
end
|
|
91
|
+
|
|
92
|
+
# Scope: earlier searches that returned results and whose stored query_vector
# lies within a `<=>` distance of 0.3 from +query_vector+, closest first.
# Returns an empty Array (not a dataset) when +query_vector+ is nil or false.
def find_similar_queries(query_vector, limit: 10)
  return [] unless query_vector

  vector_text = query_vector.to_s
  has_hits = Sequel.lit('query_vector IS NOT NULL AND results_count > 0')
  within_distance = Sequel.lit('query_vector <=> ? < ?', vector_text, 0.3)
  nearest_first = Sequel.lit('query_vector <=> ?', vector_text)

  where(has_hits).where(within_distance).order(nearest_first).limit(limit)
end
|
|
102
|
+
|
|
103
|
+
# Aggregates the entries created between +start_time+ and +end_time+ into a
# single row exposing total_searches, avg_response_time, successful_searches
# (results_count > 0) and failed_searches (results_count 0 or NULL).
# Returns the first row of that aggregate query.
def analytics_by_period(start_time:, end_time:)
  hit_flag = Sequel.lit("CASE WHEN results_count > 0 THEN 1 ELSE 0 END")
  miss_flag = Sequel.lit("CASE WHEN results_count = 0 OR results_count IS NULL THEN 1 ELSE 0 END")

  aggregates = [
    Sequel.function(:count, :*).as(:total_searches),
    Sequel.function(:avg, :execution_time_ms).as(:avg_response_time),
    Sequel.function(:sum, hit_flag).as(:successful_searches),
    Sequel.function(:sum, miss_flag).as(:failed_searches)
  ]

  where(created_at: start_time..end_time).select(*aggregates).first
end
|
|
114
|
+
|
|
115
|
+
# Deletes entries created more than +days_to_keep+ days before now and returns
# the result of the dataset delete call.
def cleanup(days_to_keep: 30)
  retention_seconds = days_to_keep * 24 * 60 * 60
  cutoff_date = Time.now - retention_seconds
  expired = where(Sequel.lit('created_at < ?', cutoff_date))
  expired.delete
end
|
|
120
|
+
|
|
121
|
+
# Exports the entries created between +start_time+ and +end_time+.
#   format :json -> JSON string of each record's to_hash
#   format :csv  -> CSV string with a header row and a fixed column subset
#   any other    -> the loaded records themselves
def export(start_time:, end_time:, format: :json)
  logs = where(created_at: start_time..end_time).all

  return logs.map(&:to_hash).to_json if :json == format
  return logs unless :csv == format

  # Convert to CSV (simplified); csv is only loaded when this branch runs.
  require 'csv'
  header = [:id, :query, :search_type, :execution_time_ms, :results_count, :created_at]
  CSV.generate do |csv|
    csv << header
    logs.each do |entry|
      csv << [entry.id, entry.query, entry.search_type, entry.execution_time_ms, entry.results_count, entry.created_at]
    end
  end
end
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
# Instance methods
|
|
144
|
+
|
|
145
|
+
# Truthy when the search returned at least one result.
# Yields nil (not false) when results_count is nil.
def successful?
  hits = results_count
  hits && hits > 0
end
|
|
149
|
+
|
|
150
|
+
# Truthy when the recorded execution time is strictly above +threshold_ms+.
# Yields nil (not false) when execution_time_ms is nil.
def slow?(threshold_ms: 100)
  elapsed = execution_time_ms
  elapsed && elapsed > threshold_ms
end
|
|
154
|
+
|
|
155
|
+
# Returns the stored filters as a Hash-like value.
# - nil filters yield an empty Hash, so callers always receive something they can index
# - a String is parsed as JSON; text that is not valid JSON yields an empty Hash
# - any other value (e.g. an already-decoded Hash) is returned unchanged
# Only JSON parse failures are rescued, so unrelated errors are no longer hidden.
def filters_hash
  raw = filters
  return {} if raw.nil?
  return raw unless raw.is_a?(String)

  JSON.parse(raw)
rescue JSON::ParserError
  {}
end
|
|
163
|
+
|
|
164
|
+
# Returns the stored result ids: an empty Array when none are stored, the
# JSON-decoded value when they are stored as a String, otherwise the stored
# value itself.
def result_ids_array
  return [] unless result_ids

  if result_ids.is_a?(String)
    JSON.parse(result_ids)
  else
    result_ids
  end
end
|
|
169
|
+
|
|
170
|
+
# Returns the stored query vector as an Array of Floats, or nil when no vector
# is stored or conversion fails.
#
# The textual form is stripped of both square and angle brackets before
# splitting. Previously only "<" and ">" were removed, so "[0.1,0.2]" produced
# "[0.1".to_f == 0.0 for the first component.
def query_vector_array
  return nil unless query_vector
  return query_vector.map(&:to_f) if query_vector.is_a?(Array)

  query_vector.to_s.delete('[]<>').split(',').map(&:to_f)
rescue StandardError
  nil
end
|
|
177
|
+
|
|
178
|
+
# Summary Hash for this entry: the stored columns id, query, search_type,
# execution_time_ms, results_count and created_at, plus the derived
# successful / slow flags (slow uses the default threshold of slow?).
def info
  summary = { id: id, query: query, search_type: search_type }
  summary[:execution_time_ms] = execution_time_ms
  summary[:results_count] = results_count
  summary[:successful] = successful?
  summary[:slow] = slow?
  summary[:created_at] = created_at
  summary
end
|
|
191
|
+
|
|
192
|
+
# String representation
|
|
193
|
+
def to_s
|
|
194
|
+
"<SearchLog: #{id} - #{query[0..50]}#{query.length > 50 ? '...' : ''} (#{search_type})>"
|
|
195
|
+
end
|
|
196
|
+
end
|
|
197
|
+
end
|
|
198
|
+
end
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
require_relative "model_base"
|
|
2
|
+
require "sequel/plugins/validation_helpers"
|
|
3
|
+
|
|
4
|
+
module SmartRAG
|
|
5
|
+
module Models
|
|
6
|
+
# SectionFts model for full-text search optimization
|
|
7
|
+
class SectionFts < Sequel::Model(:section_fts)
|
|
8
|
+
include FactoryBotHelpers
|
|
9
|
+
plugin :validation_helpers
|
|
10
|
+
plugin :timestamps, update_on_create: true
|
|
11
|
+
|
|
12
|
+
# Bang-style constructor kept for FactoryBot compatibility.
# Builds a record from +attributes+, persists it with save!, and raises
# Sequel::ValidationFailed (with the joined validation messages) when save!
# returns a falsy value. Returns the saved record.
def self.create!(attributes = {})
  record = new(attributes)
  unless record.save!
    raise(Sequel::ValidationFailed, record.errors.full_messages.join(", "))
  end
  record
end
|
|
18
|
+
|
|
19
|
+
# Relationships
|
|
20
|
+
many_to_one :section, class: '::SmartRAG::Models::SourceSection', key: :section_id
|
|
21
|
+
many_to_one :document, class: '::SmartRAG::Models::SourceDocument', key: :document_id
|
|
22
|
+
|
|
23
|
+
# Validation rules: section_id is required; language may be nil, but when set
# it must be non-blank and consist of exactly two lowercase letters.
def validate
  super
  two_letter_code = /\A[a-z]{2}\z/
  validates_presence :section_id
  validates_presence :language, allow_nil: true
  validates_format two_letter_code, :language, allow_nil: true, message: 'must be ISO 639-1 code'
end
|
|
30
|
+
|
|
31
|
+
# Class methods
|
|
32
|
+
class << self
|
|
33
|
+
# Full-text search over the tsvector columns.
#
# @param query [String] free-text query, converted by build_tsquery
# @param language [String, nil] forwarded to build_tsquery (default 'simple')
# @param fields [Array<Symbol>, Symbol] tsvector columns to match; only
#   :fts_title, :fts_content and :fts_combined are accepted
# @param limit [Integer] maximum number of rows
# @return dataset of matching rows ordered by ts_rank, best match first
# @raise [ArgumentError] when none of the requested fields is searchable
#
# The tsquery text is passed as a bound placeholder. It used to be
# interpolated unquoted into the SQL string, which produced invalid SQL and let
# query text reach the statement verbatim.
# NOTE(review): the text is parsed with the 'simple' configuration; build_tsquery
# currently ignores its language argument - confirm whether a per-language
# configuration should be applied here.
def search(query, language: nil, fields: [:fts_combined], limit: 20)
  # Only whitelisted column names are ever interpolated into SQL below.
  valid_fields = [:fts_title, :fts_content, :fts_combined]
  fields = Array(fields) & valid_fields
  if fields.empty?
    raise ArgumentError, "fields must include at least one of #{valid_fields.join(', ')}"
  end

  tsquery_text = build_tsquery(query, language || 'simple')
  tsquery_sql = "to_tsquery('simple', ?)"

  # One match condition per field, OR-ed together.
  search_conditions = fields.map do |field|
    Sequel.lit("#{field} @@ #{tsquery_sql}", tsquery_text)
  end
  rank = Sequel.lit("ts_rank(#{fields.join(' || ')}, #{tsquery_sql})", tsquery_text)

  where(Sequel.|(*search_conditions))
    .order(Sequel.desc(rank))
    .limit(limit)
end
|
|
52
|
+
|
|
53
|
+
# Scope: FTS rows belonging to the document with id +document_id+.
def by_document(document_id)
  criteria = { document_id: document_id }
  where(criteria)
end
|
|
57
|
+
|
|
58
|
+
# Scope: FTS rows belonging to the section with id +section_id+.
def by_section(section_id)
  criteria = { section_id: section_id }
  where(criteria)
end
|
|
62
|
+
|
|
63
|
+
# Scope: FTS rows whose language column equals +lang+.
def by_language(lang)
  criteria = { language: lang }
  where(criteria)
end
|
|
67
|
+
|
|
68
|
+
# Searches fts_combined and exposes the ts_rank score as a +rank+ column.
#
# @param query [String] free-text query, converted by build_tsquery
# @param language [String, nil] forwarded to build_tsquery (default 'simple')
# @param weights [String] comma-separated weights placed inside a PostgreSQL
#   array literal for ts_rank
# @param limit [Integer] maximum number of rows
# @return dataset of matching rows with an extra +rank+ column, best first
#
# Both the weights literal and the tsquery text are bound as placeholders
# instead of being interpolated into the SQL string.
# NOTE(review): PostgreSQL's ts_rank expects a four-element weights array
# ({D,C,B,A}); the three-element default looks too short - confirm.
# NOTE(review): the tsquery is parsed with the 'simple' configuration because
# build_tsquery ignores its language argument - confirm.
def search_with_ranking(query, language: nil, weights: '1.0, 0.5, 0.2', limit: 20)
  tsquery_text = build_tsquery(query, language || 'simple')
  weights_literal = "{#{weights}}"

  rank = Sequel.lit(
    "ts_rank(CAST(? AS float4[]), fts_combined, to_tsquery('simple', ?))",
    weights_literal, tsquery_text
  )
  matches = Sequel.lit("fts_combined @@ to_tsquery('simple', ?)", tsquery_text)

  select(:*, Sequel.as(rank, :rank))
    .where(matches)
    .order(Sequel.desc(:rank))
    .limit(limit)
end
|
|
78
|
+
|
|
79
|
+
# Builds tsquery text from free-form input: whitespace-separated terms are each
# given a ":*" prefix-match suffix and joined with " & " (AND).
#
# Characters that carry meaning in tsquery syntax (& | ! ( ) : * ' \ < >) are
# removed from every term, and terms left empty are dropped, so user input such
# as "foo & (bar)" cannot yield a malformed expression.
#
# @param query [#to_s] raw user query; nil or blank input yields ""
# @param language [String] accepted for interface compatibility; not used here
# @return [String] e.g. "hello:* & world:*"
def build_tsquery(query, language = 'simple')
  query.to_s.split
       .map { |term| term.gsub(/[&|!():*'\\<>]/, '') }
       .reject(&:empty?)
       .map { |term| "#{term}:*" }
       .join(' & ')
end
|
|
87
|
+
|
|
88
|
+
# Scope: FTS rows updated within the last +max_age+ seconds (one hour by default).
def fresh(max_age: 3600)
  oldest_allowed = Time.now - max_age
  where(Sequel.lit('updated_at > ?', oldest_allowed))
end
|
|
93
|
+
|
|
94
|
+
# Scope: FTS rows whose section_id appears in a subquery over source_sections
# LEFT JOINed to section_fts, selecting sections that either have no FTS row
# or were updated after their FTS row.
#
# The selected column is written as Sequel[:source_sections][:id]. The former
# :source_sections__id symbol relied on double-underscore symbol splitting,
# which Sequel 5 disables by default, so it was treated as a single column
# named "source_sections__id".
def stale
  outdated_section_ids = db[:source_sections]
    .left_join(:section_fts, section_id: :id)
    .where(Sequel.|(
      { Sequel[:section_fts][:section_id] => nil },
      Sequel.lit('source_sections.updated_at > section_fts.updated_at')
    ))
    .select(Sequel[:source_sections][:id])

  where(section_id: outdated_section_ids)
end
|
|
107
|
+
|
|
108
|
+
# Upserts the FTS row for +section_id+: updates the existing row with +attrs+
# when one is found, otherwise creates a new row from +attrs+ plus section_id.
# Returns the updated or created record.
def create_or_update(section_id, attrs)
  record = find(section_id: section_id)
  return create(attrs.merge(section_id: section_id)) unless record

  record.update(attrs)
  record
end
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
# Instance methods
|
|
121
|
+
|
|
122
|
+
# True when this row's updated_at is at or after +section_updated_at+;
# false when updated_at is not set.
def up_to_date?(section_updated_at)
  indexed_at = updated_at
  indexed_at ? indexed_at >= section_updated_at : false
end
|
|
127
|
+
|
|
128
|
+
# The three stored search vectors keyed by :title, :content and :combined.
def vectors
  result = {}
  result[:title] = fts_title
  result[:content] = fts_content
  result[:combined] = fts_combined
  result
end
|
|
136
|
+
|
|
137
|
+
# Stores the three search vectors and stamps updated_at with the current time,
# all through a single update call whose result is returned.
def update_vectors(title_vector, content_vector, combined_vector)
  changes = {
    fts_title: title_vector,
    fts_content: content_vector,
    fts_combined: combined_vector,
    updated_at: Time.now
  }
  update(changes)
end
|
|
146
|
+
|
|
147
|
+
# Computes the ts_rank of this row's fts_combined vector for +query+.
#
# @param query [String] free-text query, converted by the class-level build_tsquery
# @param language [String, nil] forwarded to build_tsquery (default 'simple')
# @param weights [String] comma-separated weights placed inside a PostgreSQL
#   array literal for ts_rank
# @return the +rank+ value of the single result row
#
# The statement is executed with Database#fetch. Database#[] only treats plain
# Strings as SQL, so the previous `db[Sequel.lit(sql, ...)]` call (a
# placeholder literal, not a String) was routed to `from` instead. The weights
# are now bound as a placeholder rather than interpolated.
# NOTE(review): PostgreSQL's ts_rank expects a four-element weights array; the
# three-element default looks too short - confirm.
# NOTE(review): the tsquery is parsed with the 'simple' configuration because
# build_tsquery ignores its language argument - confirm.
def rank_for(query, language: nil, weights: '1.0, 0.5, 0.2')
  tsquery_text = self.class.build_tsquery(query, language || 'simple')
  sql = "SELECT ts_rank(CAST(? AS float4[]), CAST(? AS tsvector), to_tsquery('simple', ?)) AS rank"
  db.fetch(sql, "{#{weights}}", fts_combined, tsquery_text).first[:rank]
end
|
|
152
|
+
|
|
153
|
+
# Summary Hash for this FTS row: section_id, document_id, language, updated_at
# and a has_vectors flag.
#
# has_vectors is computed with core Ruby only. The previous `present?` call is
# an ActiveSupport extension and raises NoMethodError when ActiveSupport is not
# loaded. The flag is true when fts_combined is non-nil and not blank.
def info
  combined = fts_combined
  {
    section_id: section_id,
    document_id: document_id,
    language: language,
    has_vectors: !combined.nil? && !combined.to_s.strip.empty?,
    updated_at: updated_at
  }
end
|
|
163
|
+
|
|
164
|
+
# String representation
|
|
165
|
+
def to_s
|
|
166
|
+
"<SectionFts: section_id=#{section_id}>"
|
|
167
|
+
end
|
|
168
|
+
end
|
|
169
|
+
end
|
|
170
|
+
end
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
require_relative "model_base"
|
|
2
|
+
require "sequel/plugins/validation_helpers"
|
|
3
|
+
|
|
4
|
+
module SmartRAG
|
|
5
|
+
module Models
|
|
6
|
+
# SectionTag model for many-to-many relationship between sections and tags
|
|
7
|
+
class SectionTag < Sequel::Model(:section_tags)
|
|
8
|
+
include FactoryBotHelpers
|
|
9
|
+
plugin :validation_helpers
|
|
10
|
+
plugin :timestamps, update_on_create: false
|
|
11
|
+
|
|
12
|
+
# Allow mass assignment of primary keys
|
|
13
|
+
unrestrict_primary_key
|
|
14
|
+
|
|
15
|
+
# Bang-style constructor kept for FactoryBot compatibility.
# Builds a record from +attributes+, persists it with save!, and raises
# Sequel::ValidationFailed (with the joined validation messages) when save!
# returns a falsy value. Returns the saved record.
def self.create!(attributes = {})
  record = new(attributes)
  unless record.save!
    raise(Sequel::ValidationFailed, record.errors.full_messages.join(", "))
  end
  record
end
|
|
21
|
+
|
|
22
|
+
# Relationships (explicit, though many_to_many is used in main models)
|
|
23
|
+
many_to_one :section, class: '::SmartRAG::Models::SourceSection', key: :section_id
|
|
24
|
+
many_to_one :tag, class: '::SmartRAG::Models::Tag', key: :tag_id
|
|
25
|
+
|
|
26
|
+
# Validation rules: both section_id and tag_id are required, and the
# (section_id, tag_id) pair must be unique.
def validate
  super
  pair = [:section_id, :tag_id]
  validates_presence pair
  validates_unique pair
end
|
|
32
|
+
|
|
33
|
+
# Class methods
|
|
34
|
+
class << self
|
|
35
|
+
# Returns the first join row linking +section_id+ to +tag_id+, or nil when
# there is none.
def find_by_section_and_tag(section_id, tag_id)
  link = where(section_id: section_id, tag_id: tag_id)
  link.first
end
|
|
39
|
+
|
|
40
|
+
# Loads every join row whose section_id equals +section_id+.
# Note that the result holds SectionTag rows, not Tag records.
def tags_for_section(section_id)
  rows = where(section_id: section_id)
  rows.all
end
|
|
44
|
+
|
|
45
|
+
# Loads every join row whose tag_id equals +tag_id+.
# Note that the result holds SectionTag rows, not SourceSection records.
def sections_for_tag(tag_id)
  rows = where(tag_id: tag_id)
  rows.all
end
|
|
49
|
+
|
|
50
|
+
# Removes every join row for +section_id+ with a dataset-level delete and
# returns that call's result.
def delete_all_for_section(section_id)
  links = where(section_id: section_id)
  links.delete
end
|
|
54
|
+
|
|
55
|
+
# Removes every join row for +tag_id+ with a dataset-level delete and returns
# that call's result.
def delete_all_for_tag(tag_id)
  links = where(tag_id: tag_id)
  links.delete
end
|
|
59
|
+
|
|
60
|
+
# Inserts all +associations+ (row hashes) through dataset.multi_insert inside
# one database transaction and returns the block's result.
def bulk_create(associations)
  db.transaction { dataset.multi_insert(associations) }
end
|
|
66
|
+
|
|
67
|
+
# True when at least one join row links +section_id+ to +tag_id+
# (decided by counting the matching rows).
def has_tag?(section_id, tag_id)
  matches = where(section_id: section_id, tag_id: tag_id).count
  matches > 0
end
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
# Instance methods
|
|
74
|
+
|
|
75
|
+
# String representation
|
|
76
|
+
def to_s
|
|
77
|
+
"<SectionTag: section:#{section_id} => tag:#{tag_id}>"
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
end
|
|
81
|
+
end
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
require_relative "model_base"
|
|
2
|
+
require "sequel/plugins/validation_helpers"
|
|
3
|
+
|
|
4
|
+
module SmartRAG
|
|
5
|
+
module Models
|
|
6
|
+
# Source document model representing original documents
|
|
7
|
+
class SourceDocument < Sequel::Model
|
|
8
|
+
# Points this model at the source_documents table. The class is declared
# without a dataset, so this is meant to be invoked once a database connection
# exists.
def self.set_dataset_from_db
  table_name = :source_documents
  set_dataset(table_name)
end
|
|
12
|
+
plugin :validation_helpers
|
|
13
|
+
plugin :timestamps, update_on_create: true
|
|
14
|
+
include FactoryBotHelpers
|
|
15
|
+
|
|
16
|
+
# Constants
|
|
17
|
+
DOWNLOAD_STATES = {
|
|
18
|
+
pending: 0,
|
|
19
|
+
completed: 1,
|
|
20
|
+
failed: 2
|
|
21
|
+
}.freeze
|
|
22
|
+
SOURCE_TYPES = %w[url file manual memory_snapshot other].freeze
|
|
23
|
+
|
|
24
|
+
# Relationships
|
|
25
|
+
one_to_many :sections, class: '::SmartRAG::Models::SourceSection', key: :document_id
|
|
26
|
+
one_to_many :section_fts, class: '::SmartRAG::Models::SectionFts', key: :document_id
|
|
27
|
+
# NOTE(review): with key: :section_id this association matches
# section_tags.section_id against this document's primary key, i.e. a document
# id is compared with section ids. It looks like it should go through the
# document's sections instead - confirm before relying on it.
one_to_many :section_tags, class: '::SmartRAG::Models::SectionTag', key: :section_id
|
|
28
|
+
|
|
29
|
+
# Validation rules:
# - title is required
# - download_state, when set, must be an integer and one of DOWNLOAD_STATES' values
# - language, when set, must be exactly two lowercase letters
# - source_type, when set, must be one of SOURCE_TYPES
def validate
  super
  allowed_states = DOWNLOAD_STATES.values
  two_letter_code = /\A[a-z]{2}\z/

  validates_presence :title
  validates_integer :download_state, allow_nil: true
  validates_includes allowed_states, :download_state, allow_nil: true
  validates_format two_letter_code, :language, allow_nil: true, message: 'must be ISO 639-1 code'
  validates_includes SOURCE_TYPES, :source_type, allow_nil: true
end
|
|
38
|
+
|
|
39
|
+
# Class methods
|
|
40
|
+
class << self
|
|
41
|
+
# Scope: documents whose download_state column equals +state+ (the stored
# integer code).
def by_download_state(state)
  criteria = { download_state: state }
  where(criteria)
end
|
|
45
|
+
|
|
46
|
+
# Scope: documents in the :completed download state.
def completed
  state_code = DOWNLOAD_STATES[:completed]
  by_download_state(state_code)
end
|
|
50
|
+
|
|
51
|
+
# Scope: documents in the :pending download state.
def pending
  state_code = DOWNLOAD_STATES[:pending]
  by_download_state(state_code)
end
|
|
55
|
+
|
|
56
|
+
# Scope: documents in the :failed download state.
def failed
  state_code = DOWNLOAD_STATES[:failed]
  by_download_state(state_code)
end
|
|
60
|
+
|
|
61
|
+
# Scope: documents whose language column equals +lang+.
def by_language(lang)
  criteria = { language: lang }
  where(criteria)
end
|
|
65
|
+
|
|
66
|
+
# Scope: documents whose title or description contains +query+ as a substring
# (SQL LIKE).
#
# The query is passed through Dataset#escape_like first, so "%" and "_" typed
# by the user are matched literally instead of acting as wildcards.
def search(query)
  pattern = "%#{dataset.escape_like(query.to_s)}%"
  where(Sequel.like(:title, pattern))
    .or(Sequel.like(:description, pattern))
end
|
|
71
|
+
|
|
72
|
+
# Scope: documents ordered by publication_date.
#
# @param direction [Symbol, String] :asc or :desc (case-insensitive), default :desc
# @raise [ArgumentError] for any other direction
#
# The direction is checked against the two supported values rather than being
# forwarded to Sequel.send, which would invoke any Sequel module method named
# by the caller.
def order_by_publication_date(direction = :desc)
  case direction.to_s.downcase
  when 'asc'
    order(Sequel.asc(:publication_date))
  when 'desc'
    order(Sequel.desc(:publication_date))
  else
    raise ArgumentError, "direction must be :asc or :desc, got #{direction.inspect}"
  end
end
|
|
76
|
+
|
|
77
|
+
# Scope: documents whose publication_date is on or after today minus +days+ days.
def recent(days: 30)
  earliest = Date.today - days
  where(Sequel.lit('publication_date >= ?', earliest))
end
|
|
81
|
+
|
|
82
|
+
# Deletes documents created more than +days+ days ago together with their
# dependent rows, inside one transaction. Deletion order: embeddings of the
# documents' sections, section_fts rows, source_sections rows, then the
# documents themselves. Returns the result of the final delete.
def delete_old_documents(days: 90)
  cutoff_date = Time.now - (days * 24 * 60 * 60)

  # Lazy subqueries, embedded into each DELETE statement below.
  expired_document_ids = db.from(:source_documents)
                           .where(Sequel.lit('created_at < ?', cutoff_date))
                           .select(:id)
  expired_section_ids = db[:source_sections].select(:id).where(document_id: expired_document_ids)

  db.transaction do
    db[:embeddings].where(source_id: expired_section_ids).delete
    db[:section_fts].where(document_id: expired_document_ids).delete
    db[:source_sections].where(document_id: expired_document_ids).delete
    where(Sequel.lit('created_at < ?', cutoff_date)).delete
  end
end
|
|
100
|
+
|
|
101
|
+
# Upserts a document from +attributes+.
#
# Lookup order:
#   1. by (source_uri, content_hash) when both are given
#   2. by url, only when a non-empty url is given
# A match is updated with +attributes+ and returned; otherwise a new document
# is created.
#
# The url lookup is skipped for a missing url. Previously a nil url ran
# `where(url: nil).first`, so any document without a url could be picked up and
# overwritten by an unrelated one.
def create_or_update(attributes)
  existing = nil

  if attributes[:source_uri] && attributes[:content_hash]
    existing = where(source_uri: attributes[:source_uri], content_hash: attributes[:content_hash]).first
  end

  url = attributes[:url]
  existing ||= find_by_url(url) unless url.nil? || url.to_s.empty?

  return create(attributes) unless existing

  existing.update(attributes)
  existing
end
|
|
118
|
+
|
|
119
|
+
# Returns the first document whose url column equals +url+, or nil.
def find_by_url(url)
  matches = where(url: url)
  matches.first
end
|
|
123
|
+
|
|
124
|
+
# Loads all documents matching every field => value pair in +criteria+
# (conditions are AND-ed by chaining one where per pair). An empty criteria
# collection loads every document.
def find_by_criteria(criteria)
  scoped = criteria.reduce(self) do |scope, (field, value)|
    scope.where(field => value)
  end
  scoped.all
end
|
|
132
|
+
|
|
133
|
+
# Inserts all +documents+ (row hashes) through dataset.multi_insert inside one
# database transaction and returns the block's result.
def batch_insert(documents)
  db.transaction { dataset.multi_insert(documents) }
end
|
|
139
|
+
|
|
140
|
+
# Dataset-level update of the document with primary key +id+: writes +state+
# to download_state and stamps updated_at with the current time. Returns the
# result of the dataset update call.
def update_download_state(id, state)
  changes = { download_state: state, updated_at: Time.now }
  where(id: id).update(changes)
end
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
# Instance methods
|
|
147
|
+
|
|
148
|
+
# True when download_state holds the :completed code.
def completed?
  expected = DOWNLOAD_STATES[:completed]
  download_state == expected
end
|
|
152
|
+
|
|
153
|
+
# True when download_state holds the :pending code.
def pending?
  expected = DOWNLOAD_STATES[:pending]
  download_state == expected
end
|
|
157
|
+
|
|
158
|
+
# True when download_state holds the :failed code.
def failed?
  expected = DOWNLOAD_STATES[:failed]
  download_state == expected
end
|
|
162
|
+
|
|
163
|
+
# Persists a new download state given by name.
#
# @param state [Symbol, String] one of the DOWNLOAD_STATES keys
#   (:pending, :completed, :failed)
# @return the result of update
# @raise [ArgumentError] when +state+ is not a known state name
#
# Unknown names are rejected. Previously DOWNLOAD_STATES[state] returned nil
# for a typo or a String key, and NULL was silently written to download_state.
def set_download_state(state)
  key = state.respond_to?(:to_sym) ? state.to_sym : state
  code = DOWNLOAD_STATES.fetch(key) do
    raise ArgumentError,
          "unknown download state #{state.inspect} (expected one of #{DOWNLOAD_STATES.keys.join(', ')})"
  end
  update(download_state: code)
end
|
|
167
|
+
|
|
168
|
+
# Loads this document's sections with their :embedding association eager-loaded.
#
# Uses sections_dataset, the association's dataset. The plain `sections`
# association method returns an Array of loaded records, which has no `eager`
# method.
def sections_with_embeddings
  sections_dataset.eager(:embedding).all
end
|
|
172
|
+
|
|
173
|
+
# Number of sections attached to this document, obtained by calling count on
# the sections association result.
def section_count
  attached = sections
  attached.count
end
|
|
177
|
+
|
|
178
|
+
# Summary Hash for this document: the descriptive columns, the download state,
# the number of sections and both timestamps, in a fixed key order.
def info
  columns = %i[id title url author publication_date language description
               source_type source_uri content_hash download_state]

  summary = columns.each_with_object({}) { |name, acc| acc[name] = send(name) }
  summary[:section_count] = section_count
  summary[:created_at] = created_at
  summary[:updated_at] = updated_at
  summary
end
|
|
197
|
+
|
|
198
|
+
# String representation
|
|
199
|
+
def to_s
|
|
200
|
+
"<SourceDocument: #{id} - #{title}>"
|
|
201
|
+
end
|
|
202
|
+
end
|
|
203
|
+
end
|
|
204
|
+
end
|