smart_rag 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +33 -0
  3. data/README.en.md +115 -0
  4. data/README.md +144 -0
  5. data/config/database.yml +42 -0
  6. data/config/fulltext_search.yml +111 -0
  7. data/config/llm_config.yml +15 -0
  8. data/config/smart_rag.yml +156 -0
  9. data/db/fix_search_issues.sql +81 -0
  10. data/db/migrations/001_create_source_documents.rb +26 -0
  11. data/db/migrations/002_create_source_sections.rb +20 -0
  12. data/db/migrations/003_create_tags.rb +17 -0
  13. data/db/migrations/004_create_research_topics.rb +16 -0
  14. data/db/migrations/005_create_relationship_tables.rb +42 -0
  15. data/db/migrations/006_create_text_search_configs.rb +28 -0
  16. data/db/migrations/007_create_section_fts.rb +109 -0
  17. data/db/migrations/008_create_embeddings.rb +28 -0
  18. data/db/migrations/009_create_search_logs.rb +30 -0
  19. data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
  20. data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
  21. data/db/rebuild_fts_complete.sql +51 -0
  22. data/db/seeds/text_search_configs.sql +28 -0
  23. data/examples/01_quick_start.rb +32 -0
  24. data/examples/02_document_management.rb +41 -0
  25. data/examples/03_search_operations.rb +46 -0
  26. data/examples/04_topics_and_tags.rb +38 -0
  27. data/examples/05_advanced_patterns.rb +154 -0
  28. data/examples/06_error_handling_and_retry.rb +64 -0
  29. data/examples/README.md +42 -0
  30. data/examples/common.rb +57 -0
  31. data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
  32. data/lib/smart_rag/config.rb +126 -0
  33. data/lib/smart_rag/core/document_processor.rb +537 -0
  34. data/lib/smart_rag/core/embedding.rb +340 -0
  35. data/lib/smart_rag/core/fulltext_manager.rb +483 -0
  36. data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
  37. data/lib/smart_rag/core/query_processor.rb +577 -0
  38. data/lib/smart_rag/errors.rb +88 -0
  39. data/lib/smart_rag/models/embedding.rb +140 -0
  40. data/lib/smart_rag/models/model_base.rb +106 -0
  41. data/lib/smart_rag/models/research_topic.rb +171 -0
  42. data/lib/smart_rag/models/research_topic_section.rb +86 -0
  43. data/lib/smart_rag/models/research_topic_tag.rb +89 -0
  44. data/lib/smart_rag/models/search_log.rb +198 -0
  45. data/lib/smart_rag/models/section_fts.rb +170 -0
  46. data/lib/smart_rag/models/section_tag.rb +81 -0
  47. data/lib/smart_rag/models/source_document.rb +204 -0
  48. data/lib/smart_rag/models/source_section.rb +201 -0
  49. data/lib/smart_rag/models/tag.rb +214 -0
  50. data/lib/smart_rag/models/text_search_config.rb +168 -0
  51. data/lib/smart_rag/models.rb +116 -0
  52. data/lib/smart_rag/parsers/query_parser.rb +291 -0
  53. data/lib/smart_rag/retrieve.rb +745 -0
  54. data/lib/smart_rag/services/embedding_service.rb +278 -0
  55. data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
  56. data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
  57. data/lib/smart_rag/services/summarization_service.rb +322 -0
  58. data/lib/smart_rag/services/tag_service.rb +614 -0
  59. data/lib/smart_rag/services/vector_search_service.rb +347 -0
  60. data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
  61. data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
  62. data/lib/smart_rag/smart_chunking/merger.rb +94 -0
  63. data/lib/smart_rag/smart_chunking/parser.rb +75 -0
  64. data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
  65. data/lib/smart_rag/smart_chunking/section.rb +11 -0
  66. data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
  67. data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
  68. data/lib/smart_rag/version.rb +3 -0
  69. data/lib/smart_rag.rb +986 -0
  70. data/workers/analyze_content.rb +6 -0
  71. data/workers/get_embedding.rb +7 -0
  72. metadata +311 -0
@@ -0,0 +1,198 @@
1
+ require_relative "model_base"
2
+ require "sequel/plugins/validation_helpers"
3
+
4
module SmartRAG
  module Models
    # Stores one row per executed search (query text, search type, timing,
    # result count, optional query vector / result ids / filters) and offers
    # reporting helpers on top of the +search_logs+ table.
    class SearchLog < Sequel::Model
      # Values accepted for the +search_type+ column.
      SEARCH_TYPES = %w[vector fulltext hybrid].freeze

      # Number of query characters shown by #to_s before truncating.
      QUERY_PREVIEW_LENGTH = 50

      # Binds the model to the +search_logs+ table. The class is declared
      # without a dataset, so this has to be called once a database
      # connection exists.
      def self.set_dataset_from_db
        set_dataset(:search_logs)
      end
      # NOTE(review): FactoryBotHelpers is not defined in this file;
      # presumably it comes from model_base.
      include FactoryBotHelpers
      plugin :validation_helpers
      plugin :timestamps, update_on_create: false

      # Builds and saves a record, raising when the save reports failure.
      #
      # NOTE(review): +save!+ is not defined in this file; presumably it is
      # supplied by FactoryBotHelpers — confirm.
      #
      # @param attributes [Hash] column values for the new record
      # @return [SearchLog] the saved instance
      # @raise [Sequel::ValidationFailed] when +save!+ returns a falsy value
      def self.create!(attributes = {})
        instance = new(attributes)
        instance.save! || raise(Sequel::ValidationFailed, instance.errors.full_messages.join(", "))
        instance
      end

      # Validation rules: +query+ is required and capped at 10000 characters,
      # +search_type+ (when set) must be one of SEARCH_TYPES, and the two
      # counters (when set) must be non-negative integers.
      def validate
        super
        validates_presence :query
        validates_max_length 10000, :query # Reasonable limit for query length
        validates_includes SEARCH_TYPES, :search_type, allow_nil: true
        validates_integer %i[execution_time_ms results_count], allow_nil: true
        # validates_integer has no range option (a greater_than_or_equal_to:
        # key is silently ignored), so the lower bound is checked explicitly.
        validate_non_negative :execution_time_ms, :results_count
      end

      # Class methods
      class << self
        # Records a search.
        #
        # @param query [String] the search text (required by validation)
        # @param search_type [String, nil] one of SEARCH_TYPES
        # @param execution_time_ms [Integer, nil]
        # @param results_count [Integer, nil]
        # @param query_vector [Object, nil] stored as given
        # @param result_ids [Object, nil] stored as given
        # @param filters [Hash, Object, nil] a Hash is serialised with
        #   +to_json+; anything else is stored as given
        # @return [SearchLog] the created record
        def log(query:, search_type: nil, execution_time_ms: nil, results_count: nil,
                query_vector: nil, result_ids: nil, filters: nil)
          create(
            query: query,
            search_type: search_type,
            execution_time_ms: execution_time_ms,
            results_count: results_count,
            query_vector: query_vector,
            result_ids: result_ids,
            filters: filters.is_a?(Hash) ? filters.to_json : filters
          )
        end

        # @param type [String] search type to filter on
        # @return [Sequel::Dataset] logs with that +search_type+
        def by_search_type(type)
          where(search_type: type)
        end

        # @param threshold_ms [Integer] exclusive lower bound
        # @return [Sequel::Dataset] logs slower than the threshold, slowest first
        def slow_queries(threshold_ms: 100)
          where(Sequel.lit('execution_time_ms IS NOT NULL AND execution_time_ms > ?', threshold_ms))
            .order(Sequel.desc(:execution_time_ms))
        end

        # @param limit [Integer] maximum number of rows
        # @return [Sequel::Dataset] newest logs first
        def recent(limit: 50)
          order(Sequel.desc(:created_at)).limit(limit)
        end

        # @param limit [Integer] maximum number of distinct queries
        # @return [Sequel::Dataset] rows of +query+ and +count+, most frequent first
        def popular(limit: 20)
          group_and_count(:query)
            .order(Sequel.desc(:count))
            .limit(limit)
        end

        # @return [Sequel::Dataset] logs whose +results_count+ is 0 or NULL
        def with_no_results
          where(results_count: 0).or(results_count: nil)
        end

        # @param threshold [Integer] exclusive lower bound
        # @return [Sequel::Dataset] logs with more results than the threshold
        def with_many_results(threshold: 100)
          where(Sequel.lit('results_count > ?', threshold))
        end

        # Average execution time and number of timed searches per search type.
        #
        # @return [Sequel::Dataset] rows of +search_type+, +avg_time+ and
        #   +count+, slowest type first
        def average_execution_time_by_type
          select(:search_type,
                 Sequel.function(:avg, :execution_time_ms).as(:avg_time),
                 # A literal count(*) avoids the symbol :* being quoted as an identifier.
                 Sequel.lit('count(*)').as(:count))
            .where(Sequel.lit('execution_time_ms IS NOT NULL'))
            .group(:search_type)
            .order(Sequel.desc(:avg_time))
        end

        # Logged queries whose stored vector is close to +query_vector+ and
        # that returned at least one result, closest first.
        #
        # NOTE(review): +<=>+ is presumably pgvector's cosine-distance
        # operator — confirm against the column type.
        #
        # @param query_vector [#to_s, nil] vector to compare against
        # @param limit [Integer] maximum number of rows
        # @param max_distance [Numeric] exclusive distance cut-off
        # @return [Sequel::Dataset, Array] an empty Array when +query_vector+
        #   is nil, otherwise a dataset
        def find_similar_queries(query_vector, limit: 10, max_distance: 0.3)
          return [] unless query_vector

          vector_literal = query_vector.to_s
          where(Sequel.lit('query_vector IS NOT NULL AND results_count > 0'))
            .where(Sequel.lit('query_vector <=> ? < ?', vector_literal, max_distance))
            .order(Sequel.lit('query_vector <=> ?', vector_literal))
            .limit(limit)
        end

        # Aggregated figures for the searches created inside the period.
        #
        # @param start_time [Time] inclusive start
        # @param end_time [Time] end of the range (inclusive for a normal Range)
        # @return [SearchLog, nil] first row of the aggregate query, carrying
        #   +total_searches+, +avg_response_time+, +successful_searches+ and
        #   +failed_searches+
        def analytics_by_period(start_time:, end_time:)
          where(created_at: start_time..end_time)
            .select(
              Sequel.lit('count(*)').as(:total_searches),
              Sequel.function(:avg, :execution_time_ms).as(:avg_response_time),
              Sequel.function(:sum, Sequel.lit("CASE WHEN results_count > 0 THEN 1 ELSE 0 END")).as(:successful_searches),
              Sequel.function(:sum, Sequel.lit("CASE WHEN results_count = 0 OR results_count IS NULL THEN 1 ELSE 0 END")).as(:failed_searches)
            )
            .first
        end

        # Deletes logs created more than +days_to_keep+ days ago.
        #
        # @param days_to_keep [Numeric]
        # @return [Integer] number of rows deleted
        def cleanup(days_to_keep: 30)
          cutoff_date = Time.now - (days_to_keep * 24 * 60 * 60)
          where(Sequel.lit('created_at < ?', cutoff_date)).delete
        end

        # Exports the logs created inside the period.
        #
        # @param start_time [Time]
        # @param end_time [Time]
        # @param format [Symbol] +:json+, +:csv+, or anything else for the
        #   raw record list
        # @return [String, Array<SearchLog>] JSON text, CSV text, or records
        def export(start_time:, end_time:, format: :json)
          logs = where(created_at: start_time..end_time).all

          case format
          when :json
            logs.map(&:to_hash).to_json
          when :csv
            # csv is only needed for this branch, so it is loaded lazily.
            require 'csv'
            CSV.generate do |csv|
              csv << [:id, :query, :search_type, :execution_time_ms, :results_count, :created_at]
              logs.each do |log|
                csv << [log.id, log.query, log.search_type, log.execution_time_ms, log.results_count, log.created_at]
              end
            end
          else
            logs
          end
        end
      end

      # Instance methods

      # @return [Boolean] true when the search returned at least one result
      def successful?
        !results_count.nil? && results_count > 0
      end

      # @param threshold_ms [Numeric] exclusive lower bound
      # @return [Boolean] true when a timing is recorded and exceeds the threshold
      def slow?(threshold_ms: 100)
        !execution_time_ms.nil? && execution_time_ms > threshold_ms
      end

      # Filters as a Hash.
      #
      # @return [Object, nil] the parsed JSON when +filters+ is a String, the
      #   stored value otherwise (nil included), or +{}+ when parsing fails
      def filters_hash
        filters.is_a?(String) ? JSON.parse(filters) : filters
      rescue StandardError
        {}
      end

      # Result ids as an Array.
      #
      # @return [Array, Object] +[]+ when nothing is stored, the parsed JSON
      #   when the value is a String, the stored value otherwise
      # @raise [JSON::ParserError] when a String value is not valid JSON
      def result_ids_array
        return [] unless result_ids

        result_ids.is_a?(String) ? JSON.parse(result_ids) : result_ids
      end

      # Query vector as an Array of Floats.
      #
      # Square brackets are stripped as well as angle brackets; otherwise a
      # value rendered as "[1,2,3]" would yield 0.0 for its first component
      # ("[1".to_f is 0.0).
      #
      # @return [Array<Float>, nil] nil when no vector is stored or conversion fails
      def query_vector_array
        return nil unless query_vector

        query_vector.to_s.gsub(/[\[\]<>]/, '').split(',').map(&:to_f)
      rescue StandardError
        nil
      end

      # @return [Hash] summary of the log entry
      def info
        {
          id: id,
          query: query,
          search_type: search_type,
          execution_time_ms: execution_time_ms,
          results_count: results_count,
          successful: successful?,
          slow: slow?,
          created_at: created_at
        }
      end

      # Short representation with the query cut to QUERY_PREVIEW_LENGTH
      # characters; a nil query is rendered as an empty string.
      #
      # @return [String]
      def to_s
        text = query.to_s
        preview = text.length > QUERY_PREVIEW_LENGTH ? "#{text[0, QUERY_PREVIEW_LENGTH]}..." : text
        "<SearchLog: #{id} - #{preview} (#{search_type})>"
      end

      private

      # Adds a validation error for each listed column holding a negative
      # Integer. Non-integer values are left to validates_integer.
      def validate_non_negative(*columns)
        columns.each do |column|
          value = self[column]
          next unless value.is_a?(Integer) && value.negative?

          errors.add(column, 'must be greater than or equal to 0')
        end
      end
    end
  end
end
@@ -0,0 +1,170 @@
1
+ require_relative "model_base"
2
+ require "sequel/plugins/validation_helpers"
3
+
4
module SmartRAG
  module Models
    # Full-text-search companion row for a source section: holds the
    # +fts_title+, +fts_content+ and +fts_combined+ search vectors plus the
    # owning section/document ids and a language code.
    class SectionFts < Sequel::Model(:section_fts)
      # NOTE(review): FactoryBotHelpers is not defined in this file;
      # presumably it comes from model_base.
      include FactoryBotHelpers
      plugin :validation_helpers
      plugin :timestamps, update_on_create: true

      # Vector columns that .search may be pointed at.
      SEARCHABLE_FIELDS = %i[fts_title fts_content fts_combined].freeze

      # Text search configuration used when the caller passes no language.
      DEFAULT_TS_CONFIG = 'simple'

      # Characters that carry meaning in tsquery syntax; they are removed
      # from user terms so a stray operator cannot break the query.
      TSQUERY_OPERATOR_CHARS = /[&|!():*'\\<>]/.freeze

      # PostgreSQL's documented default ts_rank weights, in {D, C, B, A}
      # order. Used to fill positions the caller did not supply.
      DEFAULT_RANK_WEIGHTS = [0.1, 0.2, 0.4, 1.0].freeze

      # Builds and saves a record, raising when the save reports failure.
      #
      # NOTE(review): +save!+ is not defined in this file; presumably it is
      # supplied by FactoryBotHelpers — confirm.
      #
      # @param attributes [Hash] column values for the new record
      # @return [SectionFts] the saved instance
      # @raise [Sequel::ValidationFailed] when +save!+ returns a falsy value
      def self.create!(attributes = {})
        instance = new(attributes)
        instance.save! || raise(Sequel::ValidationFailed, instance.errors.full_messages.join(", "))
        instance
      end

      # Relationships
      many_to_one :section, class: '::SmartRAG::Models::SourceSection', key: :section_id
      many_to_one :document, class: '::SmartRAG::Models::SourceDocument', key: :document_id

      # Validation rules: +section_id+ is required; +language+, when set,
      # must be non-blank and a two-letter lowercase code.
      def validate
        super
        validates_presence :section_id
        validates_presence :language, allow_nil: true
        validates_format(/\A[a-z]{2}\z/, :language, allow_nil: true, message: 'must be ISO 639-1 code')
      end

      # Class methods
      class << self
        # Full-text search over one or more vector columns (OR-ed together),
        # ordered by ts_rank over the concatenated vectors.
        #
        # The query and configuration are sent as SQL values through
        # to_tsquery rather than being spliced into the SQL text.
        #
        # @param query [#to_s] free text; every word becomes a prefix term
        # @param language [String, nil] text search configuration name;
        #   DEFAULT_TS_CONFIG when nil. NOTE(review): this must be a
        #   configuration name such as 'english', not the two-letter code
        #   stored in the +language+ column — confirm with callers.
        # @param fields [Symbol, Array<Symbol>] subset of SEARCHABLE_FIELDS;
        #   unknown entries are dropped and an empty selection falls back to
        #   +:fts_combined+
        # @param limit [Integer] maximum number of rows
        # @return [Sequel::Dataset] matching rows, best rank first; a dataset
        #   that matches nothing when the query has no usable terms
        def search(query, language: nil, fields: [:fts_combined], limit: 20)
          columns = Array(fields) & SEARCHABLE_FIELDS
          columns = [:fts_combined] if columns.empty?

          tsquery = tsquery_expression(query, language)
          return where(false) unless tsquery

          # Column names come from the SEARCHABLE_FIELDS whitelist, so
          # interpolating them is safe; the tsquery is a bound expression.
          conditions = columns.map { |column| Sequel.lit("#{column} @@ ?", tsquery) }
          combined_vector = Sequel.lit(columns.join(' || '))

          where(Sequel.|(*conditions))
            .order(Sequel.desc(Sequel.function(:ts_rank, combined_vector, tsquery)))
            .limit(limit)
        end

        # @param document_id [Integer]
        # @return [Sequel::Dataset] rows belonging to the document
        def by_document(document_id)
          where(document_id: document_id)
        end

        # @param section_id [Integer]
        # @return [Sequel::Dataset] rows belonging to the section
        def by_section(section_id)
          where(section_id: section_id)
        end

        # @param lang [String]
        # @return [Sequel::Dataset] rows with that language code
        def by_language(lang)
          where(language: lang)
        end

        # Search on +fts_combined+ that also selects the weighted rank as
        # +rank+ and orders by it.
        #
        # @param query [#to_s] free text; every word becomes a prefix term
        # @param language [String, nil] text search configuration name
        # @param weights [String, Array<Numeric>] ts_rank weights in
        #   PostgreSQL's {D, C, B, A} order; see .rank_weights_expression
        # @param limit [Integer] maximum number of rows
        # @return [Sequel::Dataset] matching rows with an extra +rank+ column
        # @raise [ArgumentError] when a weight is not numeric
        def search_with_ranking(query, language: nil, weights: '1.0, 0.5, 0.2', limit: 20)
          tsquery = tsquery_expression(query, language)
          return where(false) unless tsquery

          rank = Sequel.function(:ts_rank, rank_weights_expression(weights), :fts_combined, tsquery)

          select_append(rank.as(:rank))
            .where(Sequel.lit('fts_combined @@ ?', tsquery))
            .order(Sequel.desc(:rank))
            .limit(limit)
        end

        # Turns free text into tsquery text: whitespace-separated words are
        # AND-ed and each gets a +:*+ prefix-match suffix. tsquery operator
        # characters inside words are removed and words left empty are
        # dropped.
        #
        # @param query [#to_s] free text
        # @param language [String] accepted for interface compatibility; not
        #   used when building the text
        # @return [String] e.g. "foo:* & bar:*"; empty when no terms remain
        def build_tsquery(query, language = 'simple')
          query.to_s.split
               .map { |term| term.gsub(TSQUERY_OPERATOR_CHARS, '') }
               .reject(&:empty?)
               .map { |term| "#{term}:*" }
               .join(' & ')
        end

        # SQL expression +to_tsquery(CAST(config AS regconfig), text)+ for
        # the given free text.
        #
        # @param query [#to_s] free text
        # @param language [String, nil] configuration name; DEFAULT_TS_CONFIG when nil
        # @return [Sequel::SQL::Function, nil] nil when the query has no usable terms
        def tsquery_expression(query, language = nil)
          text = build_tsquery(query)
          return nil if text.empty?

          config = (language || DEFAULT_TS_CONFIG).to_s
          Sequel.function(:to_tsquery, Sequel.cast(config, :regconfig), text)
        end

        # SQL expression for a ts_rank weight array.
        #
        # Every entry is parsed with Float, so arbitrary text cannot reach
        # the SQL. ts_rank rejects arrays with fewer than four entries, so
        # missing trailing positions are filled from DEFAULT_RANK_WEIGHTS.
        #
        # NOTE(review): the historical default '1.0, 0.5, 0.2' has three
        # entries and reads like an A/B/C ordering; it is kept positionally
        # as {D, C, B} here — confirm the intended weighting.
        #
        # @param weights [String, Array<Numeric>] comma-separated text or an array
        # @return [Sequel::SQL::PlaceholderLiteralString]
        # @raise [ArgumentError] when an entry is not numeric
        def rank_weights_expression(weights)
          raw = weights.is_a?(Array) ? weights : weights.to_s.split(',')
          values = raw.map { |weight| Float(weight.to_s.strip) }
          values += DEFAULT_RANK_WEIGHTS.drop(values.length)

          Sequel.lit('CAST(? AS float4[])', "{#{values.join(',')}}")
        end

        # @param max_age [Numeric] age limit in seconds
        # @return [Sequel::Dataset] rows updated within the last +max_age+ seconds
        def fresh(max_age: 3600)
          where(Sequel.lit('updated_at > ?', Time.now - max_age))
        end

        # Rows whose section has been updated after the FTS row was.
        #
        # The subquery also selects sections with no FTS row at all, but
        # those cannot appear in the result since it is drawn from this table.
        #
        # @return [Sequel::Dataset]
        def stale
          outdated_section_ids = db[:source_sections]
            .left_join(:section_fts, section_id: :id)
            .where(Sequel.|(
              { Sequel[:section_fts][:section_id] => nil },
              Sequel.lit('source_sections.updated_at > section_fts.updated_at')
            ))
            # Qualified identifier object: double-underscore symbols are not
            # split into table/column by default in Sequel 5.
            .select(Sequel[:source_sections][:id])

          where(section_id: outdated_section_ids)
        end

        # Updates the row for +section_id+ or creates it when missing.
        #
        # @param section_id [Integer]
        # @param attrs [Hash] column values to write
        # @return [SectionFts] the updated or created record
        def create_or_update(section_id, attrs)
          existing = find(section_id: section_id)
          return create(attrs.merge(section_id: section_id)) unless existing

          existing.update(attrs)
          existing
        end
      end

      # Instance methods

      # @param section_updated_at [Time] the section's last update time
      # @return [Boolean] true when this row is at least as new as the section
      def up_to_date?(section_updated_at)
        return false unless updated_at

        updated_at >= section_updated_at
      end

      # @return [Hash] the three search vectors keyed by :title, :content, :combined
      def vectors
        {
          title: fts_title,
          content: fts_content,
          combined: fts_combined
        }
      end

      # Writes all three vectors and stamps +updated_at+ with the current time.
      #
      # @return [SectionFts] self (return value of +update+)
      def update_vectors(title_vector, content_vector, combined_vector)
        update(
          fts_title: title_vector,
          fts_content: content_vector,
          fts_combined: combined_vector,
          updated_at: Time.now
        )
      end

      # Rank of this row's combined vector for a query, computed by the
      # database. Every argument is cast explicitly so the overloaded
      # ts_rank resolves unambiguously.
      #
      # @param query [#to_s] free text
      # @param language [String, nil] text search configuration name
      # @param weights [String, Array<Numeric>] see .rank_weights_expression
      # @return [Float, nil] 0.0 when the query has no usable terms; nil when
      #   the database returns NULL (e.g. no stored vector)
      # @raise [ArgumentError] when a weight is not numeric
      def rank_for(query, language: nil, weights: '1.0, 0.5, 0.2')
        tsquery = self.class.tsquery_expression(query, language)
        return 0.0 unless tsquery

        rank = Sequel.function(
          :ts_rank,
          self.class.rank_weights_expression(weights),
          Sequel.cast(fts_combined, :tsvector),
          tsquery
        )
        db.get(rank.as(:rank))
      end

      # @return [Hash] summary of the row; +has_vectors+ is true when
      #   +fts_combined+ holds non-blank text
      def info
        {
          section_id: section_id,
          document_id: document_id,
          language: language,
          # Plain-Ruby blank check; present? is not available without ActiveSupport.
          has_vectors: !fts_combined.to_s.strip.empty?,
          updated_at: updated_at
        }
      end

      # @return [String]
      def to_s
        "<SectionFts: section_id=#{section_id}>"
      end
    end
  end
end
@@ -0,0 +1,81 @@
1
+ require_relative "model_base"
2
+ require "sequel/plugins/validation_helpers"
3
+
4
module SmartRAG
  module Models
    # Join model linking a source section to a tag (table +section_tags+).
    class SectionTag < Sequel::Model(:section_tags)
      # NOTE(review): FactoryBotHelpers is not defined in this file;
      # presumably it comes from model_base.
      include FactoryBotHelpers
      plugin :validation_helpers
      plugin :timestamps, update_on_create: false

      # Primary key columns may be set through mass assignment.
      unrestrict_primary_key

      # Builds and saves a record, raising when the save reports failure.
      #
      # @param attributes [Hash] column values for the new record
      # @return [SectionTag] the saved instance
      # @raise [Sequel::ValidationFailed] when +save!+ returns a falsy value
      def self.create!(attributes = {})
        record = new(attributes)
        saved = record.save!
        raise(Sequel::ValidationFailed, record.errors.full_messages.join(", ")) unless saved

        record
      end

      # Both ends of the link as explicit associations.
      many_to_one :section, class: '::SmartRAG::Models::SourceSection', key: :section_id
      many_to_one :tag, class: '::SmartRAG::Models::Tag', key: :tag_id

      # Both ids are required and the pair must be unique.
      def validate
        super
        pair = [:section_id, :tag_id]
        validates_presence pair
        validates_unique pair
      end

      class << self
        # @param section_id [Integer]
        # @param tag_id [Integer]
        # @return [SectionTag, nil] the link for that pair, if any
        def find_by_section_and_tag(section_id, tag_id)
          first(section_id: section_id, tag_id: tag_id)
        end

        # @param section_id [Integer]
        # @return [Array<SectionTag>] every link of the section
        def tags_for_section(section_id)
          links = where(section_id: section_id)
          links.all
        end

        # @param tag_id [Integer]
        # @return [Array<SectionTag>] every link of the tag
        def sections_for_tag(tag_id)
          links = where(tag_id: tag_id)
          links.all
        end

        # Removes every link of a section.
        #
        # @param section_id [Integer]
        # @return [Integer] number of rows deleted
        def delete_all_for_section(section_id)
          links = where(section_id: section_id)
          links.delete
        end

        # Removes every link of a tag.
        #
        # @param tag_id [Integer]
        # @return [Integer] number of rows deleted
        def delete_all_for_tag(tag_id)
          links = where(tag_id: tag_id)
          links.delete
        end

        # Inserts many links inside one transaction, going straight to the
        # dataset (no model instances are built).
        #
        # @param associations [Array<Hash>] rows to insert
        def bulk_create(associations)
          db.transaction { dataset.multi_insert(associations) }
        end

        # @param section_id [Integer]
        # @param tag_id [Integer]
        # @return [Boolean] true when the section carries the tag
        def has_tag?(section_id, tag_id)
          matching = where(section_id: section_id, tag_id: tag_id).count
          matching > 0
        end
      end

      # @return [String]
      def to_s
        "<SectionTag: section:#{section_id} => tag:#{tag_id}>"
      end
    end
  end
end
@@ -0,0 +1,204 @@
1
+ require_relative "model_base"
2
+ require "sequel/plugins/validation_helpers"
3
+
4
module SmartRAG
  module Models
    # An original document (table +source_documents+) together with lookup,
    # maintenance and download-state helpers.
    class SourceDocument < Sequel::Model
      # Binds the model to the +source_documents+ table. The class is
      # declared without a dataset, so this has to be called once a database
      # connection exists.
      def self.set_dataset_from_db
        set_dataset(:source_documents)
      end
      plugin :validation_helpers
      plugin :timestamps, update_on_create: true
      # NOTE(review): FactoryBotHelpers is not defined in this file;
      # presumably it comes from model_base.
      include FactoryBotHelpers

      # Integer codes stored in +download_state+.
      DOWNLOAD_STATES = {
        pending: 0,
        completed: 1,
        failed: 2
      }.freeze

      # Values accepted for +source_type+.
      SOURCE_TYPES = %w[url file manual memory_snapshot other].freeze

      # Sort directions accepted by .order_by_publication_date.
      SORT_DIRECTIONS = %w[asc desc].freeze

      # Relationships
      one_to_many :sections, class: '::SmartRAG::Models::SourceSection', key: :document_id
      one_to_many :section_fts, class: '::SmartRAG::Models::SectionFts', key: :document_id
      # NOTE(review): this matches section_tags.section_id against the
      # document's primary key, which looks unintended (section tags belong
      # to sections, not documents). Left unchanged — confirm before relying on it.
      one_to_many :section_tags, class: '::SmartRAG::Models::SectionTag', key: :section_id

      # Validation rules: +title+ is required; +download_state+, +language+
      # and +source_type+ are optional but constrained when set.
      def validate
        super
        validates_presence :title
        validates_integer :download_state, allow_nil: true
        validates_includes DOWNLOAD_STATES.values, :download_state, allow_nil: true
        validates_format(/\A[a-z]{2}\z/, :language, allow_nil: true, message: 'must be ISO 639-1 code')
        validates_includes SOURCE_TYPES, :source_type, allow_nil: true
      end

      # Class methods
      class << self
        # @param state [Integer] a DOWNLOAD_STATES value
        # @return [Sequel::Dataset] documents in that state
        def by_download_state(state)
          where(download_state: state)
        end

        # @return [Sequel::Dataset] documents whose download completed
        def completed
          by_download_state(DOWNLOAD_STATES[:completed])
        end

        # @return [Sequel::Dataset] documents still waiting for download
        def pending
          by_download_state(DOWNLOAD_STATES[:pending])
        end

        # @return [Sequel::Dataset] documents whose download failed
        def failed
          by_download_state(DOWNLOAD_STATES[:failed])
        end

        # @param lang [String]
        # @return [Sequel::Dataset] documents with that language code
        def by_language(lang)
          where(language: lang)
        end

        # Substring search over +title+ and +description+.
        #
        # LIKE metacharacters in the query are escaped so that "%" and "_"
        # typed by a user match literally.
        #
        # @param query [#to_s] text to look for
        # @return [Sequel::Dataset]
        def search(query)
          pattern = "%#{dataset.escape_like(query.to_s)}%"
          where(Sequel.like(:title, pattern))
            .or(Sequel.like(:description, pattern))
        end

        # @param direction [Symbol, String] +:asc+ or +:desc+
        # @return [Sequel::Dataset] documents ordered by +publication_date+
        # @raise [ArgumentError] for any other direction, so an arbitrary
        #   method name is never dispatched to Sequel
        def order_by_publication_date(direction = :desc)
          unless SORT_DIRECTIONS.include?(direction.to_s)
            raise ArgumentError, "direction must be :asc or :desc, got #{direction.inspect}"
          end

          order(Sequel.public_send(direction, :publication_date))
        end

        # @param days [Integer] look-back window in days
        # @return [Sequel::Dataset] documents published within the window
        def recent(days: 30)
          where(Sequel.lit('publication_date >= ?', Date.today - days))
        end

        # Deletes documents created more than +days+ days ago, after removing
        # the embeddings, FTS rows and sections that hang off them, all in
        # one transaction.
        #
        # NOTE(review): rows in section_tags for the removed sections are not
        # deleted here; presumably handled by the schema — confirm.
        #
        # @param days [Numeric]
        # @return [Integer] number of documents deleted
        def delete_old_documents(days: 90)
          cutoff_date = Time.now - (days * 24 * 60 * 60)
          expired = Sequel.lit('created_at < ?', cutoff_date)
          expired_document_ids = db.from(:source_documents).where(expired).select(:id)
          expired_section_ids = db[:source_sections].select(:id).where(document_id: expired_document_ids)

          db.transaction do
            # Children first, parents last.
            db[:embeddings].where(source_id: expired_section_ids).delete
            db[:section_fts].where(document_id: expired_document_ids).delete
            db[:source_sections].where(document_id: expired_document_ids).delete
            where(expired).delete
          end
        end

        # Updates an existing document or creates a new one.
        #
        # An existing document is looked up first by +source_uri+ plus
        # +content_hash+ (when both are given), then by +url+. The URL lookup
        # is skipped when no URL is supplied; looking up a nil URL would
        # match, and overwrite, any stored document that has no URL.
        #
        # @param attributes [Hash] column values, symbol keys
        # @return [SourceDocument] the updated or created record
        def create_or_update(attributes)
          existing = nil
          if attributes[:source_uri] && attributes[:content_hash]
            existing = where(source_uri: attributes[:source_uri], content_hash: attributes[:content_hash]).first
          end

          url = attributes[:url]
          existing ||= find_by_url(url) unless url.nil? || url.to_s.empty?

          return create(attributes) unless existing

          existing.update(attributes)
          existing
        end

        # @param url [String]
        # @return [SourceDocument, nil] first document with that URL
        def find_by_url(url)
          where(url: url).first
        end

        # All documents matching every field/value pair.
        #
        # @param criteria [Hash] column => value
        # @return [Array<SourceDocument>] every document when +criteria+ is empty
        def find_by_criteria(criteria)
          criteria.reduce(self) { |scope, (field, value)| scope.where(field => value) }.all
        end

        # Inserts many documents inside one transaction, going straight to
        # the dataset (no model instances are built).
        #
        # @param documents [Array<Hash>] rows to insert
        def batch_insert(documents)
          db.transaction do
            dataset.multi_insert(documents)
          end
        end

        # Sets the state of one document without loading it.
        #
        # @param id [Integer] document id
        # @param state [Integer] a DOWNLOAD_STATES value, written as given
        # @return [Integer] number of rows updated
        def update_download_state(id, state)
          where(id: id).update(download_state: state, updated_at: Time.now)
        end
      end

      # Instance methods

      # @return [Boolean] true when the download completed
      def completed?
        download_state == DOWNLOAD_STATES[:completed]
      end

      # @return [Boolean] true when the download is still pending
      def pending?
        download_state == DOWNLOAD_STATES[:pending]
      end

      # @return [Boolean] true when the download failed
      def failed?
        download_state == DOWNLOAD_STATES[:failed]
      end

      # Sets the download state by name.
      #
      # @param state [Symbol, String] a DOWNLOAD_STATES key
      # @return [SourceDocument] self (return value of +update+)
      # @raise [ArgumentError] for an unknown state, instead of silently
      #   writing NULL
      def set_download_state(state)
        key = state.is_a?(String) ? state.to_sym : state
        code = DOWNLOAD_STATES.fetch(key) do
          raise ArgumentError, "unknown download state: #{state.inspect}"
        end
        update(download_state: code)
      end

      # Sections of this document with their embedding eager-loaded.
      #
      # Goes through +sections_dataset+: the +sections+ reader returns a
      # loaded Array, which has no +eager+ method.
      #
      # NOTE(review): assumes SourceSection defines an +:embedding+
      # association — confirm.
      #
      # @return [Array]
      def sections_with_embeddings
        sections_dataset.eager(:embedding).all
      end

      # @return [Integer] number of sections, counted in the database
      def section_count
        sections_dataset.count
      end

      # @return [Hash] summary of the document, including its section count
      def info
        {
          id: id,
          title: title,
          url: url,
          author: author,
          publication_date: publication_date,
          language: language,
          description: description,
          source_type: source_type,
          source_uri: source_uri,
          content_hash: content_hash,
          download_state: download_state,
          section_count: section_count,
          created_at: created_at,
          updated_at: updated_at
        }
      end

      # @return [String]
      def to_s
        "<SourceDocument: #{id} - #{title}>"
      end
    end
  end
end