smart_rag 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +33 -0
- data/README.en.md +115 -0
- data/README.md +144 -0
- data/config/database.yml +42 -0
- data/config/fulltext_search.yml +111 -0
- data/config/llm_config.yml +15 -0
- data/config/smart_rag.yml +156 -0
- data/db/fix_search_issues.sql +81 -0
- data/db/migrations/001_create_source_documents.rb +26 -0
- data/db/migrations/002_create_source_sections.rb +20 -0
- data/db/migrations/003_create_tags.rb +17 -0
- data/db/migrations/004_create_research_topics.rb +16 -0
- data/db/migrations/005_create_relationship_tables.rb +42 -0
- data/db/migrations/006_create_text_search_configs.rb +28 -0
- data/db/migrations/007_create_section_fts.rb +109 -0
- data/db/migrations/008_create_embeddings.rb +28 -0
- data/db/migrations/009_create_search_logs.rb +30 -0
- data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
- data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
- data/db/rebuild_fts_complete.sql +51 -0
- data/db/seeds/text_search_configs.sql +28 -0
- data/examples/01_quick_start.rb +32 -0
- data/examples/02_document_management.rb +41 -0
- data/examples/03_search_operations.rb +46 -0
- data/examples/04_topics_and_tags.rb +38 -0
- data/examples/05_advanced_patterns.rb +154 -0
- data/examples/06_error_handling_and_retry.rb +64 -0
- data/examples/README.md +42 -0
- data/examples/common.rb +57 -0
- data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
- data/lib/smart_rag/config.rb +126 -0
- data/lib/smart_rag/core/document_processor.rb +537 -0
- data/lib/smart_rag/core/embedding.rb +340 -0
- data/lib/smart_rag/core/fulltext_manager.rb +483 -0
- data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
- data/lib/smart_rag/core/query_processor.rb +577 -0
- data/lib/smart_rag/errors.rb +88 -0
- data/lib/smart_rag/models/embedding.rb +140 -0
- data/lib/smart_rag/models/model_base.rb +106 -0
- data/lib/smart_rag/models/research_topic.rb +171 -0
- data/lib/smart_rag/models/research_topic_section.rb +86 -0
- data/lib/smart_rag/models/research_topic_tag.rb +89 -0
- data/lib/smart_rag/models/search_log.rb +198 -0
- data/lib/smart_rag/models/section_fts.rb +170 -0
- data/lib/smart_rag/models/section_tag.rb +81 -0
- data/lib/smart_rag/models/source_document.rb +204 -0
- data/lib/smart_rag/models/source_section.rb +201 -0
- data/lib/smart_rag/models/tag.rb +214 -0
- data/lib/smart_rag/models/text_search_config.rb +168 -0
- data/lib/smart_rag/models.rb +116 -0
- data/lib/smart_rag/parsers/query_parser.rb +291 -0
- data/lib/smart_rag/retrieve.rb +745 -0
- data/lib/smart_rag/services/embedding_service.rb +278 -0
- data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
- data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
- data/lib/smart_rag/services/summarization_service.rb +322 -0
- data/lib/smart_rag/services/tag_service.rb +614 -0
- data/lib/smart_rag/services/vector_search_service.rb +347 -0
- data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
- data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
- data/lib/smart_rag/smart_chunking/merger.rb +94 -0
- data/lib/smart_rag/smart_chunking/parser.rb +75 -0
- data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
- data/lib/smart_rag/smart_chunking/section.rb +11 -0
- data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
- data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
- data/lib/smart_rag/version.rb +3 -0
- data/lib/smart_rag.rb +986 -0
- data/workers/analyze_content.rb +6 -0
- data/workers/get_embedding.rb +7 -0
- metadata +311 -0
|
@@ -0,0 +1,614 @@
|
|
|
1
|
+
require_relative "../models/tag"
|
|
2
|
+
require_relative "../models/section_tag"
|
|
3
|
+
require_relative "../models/source_section"
|
|
4
|
+
require_relative "../errors"
|
|
5
|
+
require "smart_prompt"
|
|
6
|
+
|
|
7
|
+
module SmartRAG
|
|
8
|
+
module Services
|
|
9
|
+
# Service for managing tags, including generation, hierarchy, and content associations
|
|
10
|
+
class TagService
|
|
11
|
+
attr_reader :config, :smart_prompt_engine, :logger
|
|
12
|
+
|
|
13
|
+
# Build a TagService: merge the caller's settings over the defaults, load
# every worker script and start the SmartPrompt engine.
#
# @param config [Hash, nil] Overrides for the defaults (nil is treated as {})
# @option config [String] :config_path Path handed to SmartPrompt::Engine
#   (falls back to "config/llm_config.yml")
# @option config [Integer] :max_retries Attempts for LLM calls (default: 3)
# @option config [Integer] :timeout Per-attempt timeout for LLM calls (default: 30)
# @option config [Logger] :logger Logger instance (default: Logger.new(STDOUT))
# @raise [StandardError] any failure during setup is logged and re-raised
def initialize(config = {})
  # A provisional logger exists first so failures below can still be reported.
  @logger = Logger.new(STDOUT)
  @config = default_config.merge(config || {})
  @logger = @config[:logger] || @logger

  # Worker scripts live three directories above this file, in "workers".
  worker_pattern = File.join(File.dirname(__FILE__), "..", "..", "..", "workers", "*.rb")
  Dir.glob(worker_pattern).each { |worker_file| require worker_file }

  engine_config_path = @config[:config_path] || "config/llm_config.yml"
  @smart_prompt_engine = SmartPrompt::Engine.new(engine_config_path)
rescue StandardError => e
  log_error("Failed to initialize TagService", e)
  raise
end
|
|
36
|
+
|
|
37
|
+
# Guard used before tag generation: rejects blank input.
#
# @param text [#to_s] The text to validate
# @return [nil] when the text has non-whitespace content
# @raise [ArgumentError] if text is nil, empty or whitespace-only
def validate_input!(text)
  return unless text.to_s.strip.empty?

  raise ArgumentError, "Text cannot be empty"
end
|
|
43
|
+
|
|
44
|
+
# Ask the LLM for category and content tags describing a piece of text.
#
# The text is validated, cut to config[:max_text_length] characters
# (4000 when unset) with "..." appended, turned into a prompt, sent to the
# LLM and the reply parsed into a tag hash.
#
# @param text [String] Text content to analyze
# @param topic [String, nil] Optional topic/context for the text
# @param languages [Array<Symbol>] Target languages, passed to the prompt builder
# @param options [Hash] Forwarded to the prompt builder and the LLM call
# @option options [Boolean] :include_category Ask for category tags (default: true)
# @option options [Boolean] :include_content Ask for content tags (default: true)
# @option options [Integer] :max_category_tags Cap on category tags
#   (the prompt builder in this file falls back to 3 when unset)
# @option options [Integer] :max_content_tags Cap on content tags
#   (the prompt builder in this file falls back to 5 when unset)
# @return [Hash] { categories: [...], content_tags: [...] }
# @raise [ArgumentError] if text is blank (ArgumentErrors are re-raised untouched)
# @raise [SmartRAG::Errors::TagGenerationError] for any other failure
# @example
#   generate_tags("Machine learning algorithms...", "AI Research")
#   # => {
#   #   categories: ["Machine Learning", "Artificial Intelligence"],
#   #   content_tags: ["Neural Networks", "Deep Learning", "Training Data"]
#   # }
def generate_tags(text, topic = nil, languages = [:zh_cn, :en], options = {})
  validate_input!(text)

  length_cap = config[:max_text_length] || 4000
  snippet =
    if text.length > length_cap
      text[0...length_cap] + "..."
    else
      text
    end

  llm_prompt = build_tag_generation_prompt(snippet, topic, languages, options)
  llm_reply = call_llm_for_tags(llm_prompt, options)

  parse_generated_tags(llm_reply, languages)
rescue ArgumentError
  # Validation errors reach the caller unchanged.
  raise
rescue StandardError => e
  log_error("Failed to generate tags", e)
  raise ::SmartRAG::Errors::TagGenerationError, "Tag generation failed: #{e.message}"
end
|
|
84
|
+
|
|
85
|
+
# Generate tags for several sections, one LLM request per section.
#
# A section whose generation fails is logged and mapped to empty tag lists,
# so one bad section does not abort the batch.
#
# @param sections [Array<SourceSection>] Sections to generate tags for
# @param topic [String, nil] Topic/context for the content
# @param languages [Array<Symbol>] Target languages for tags
# @param options [Hash] Options forwarded to generate_tags
# @return [Hash] section id => { categories: [...], content_tags: [...] }
# @raise [SmartRAG::Errors::TagGenerationError] on any failure outside the
#   per-section handling (including a nil sections argument)
def batch_generate_tags(sections, topic = nil, languages = [:zh_cn, :en], options = {})
  raise ArgumentError, "Sections cannot be nil" unless sections
  return {} if sections.empty?

  logger.info "Generating tags for #{sections.size} sections"

  tags_by_section = {}
  sections.each_with_index do |section, position|
    begin
      generated = generate_tags(prepare_section_text(section), topic, languages, options)
      tags_by_section[section.id] = generated

      logger.info "Generated tags for section #{section.id} (#{position + 1}/#{sections.size})"
    rescue StandardError => e
      logger.error "Failed to generate tags for section #{section.id}: #{e.message}"
      tags_by_section[section.id] = { categories: [], content_tags: [] }
    end
  end

  tags_by_section
rescue StandardError => e
  log_error("Failed to batch generate tags", e)
  raise ::SmartRAG::Errors::TagGenerationError, "Batch tag generation failed: #{e.message}"
end
|
|
116
|
+
|
|
117
|
+
# Look up each tag name, creating the tag when it does not exist yet.
#
# Names are normalized with clean_tag_name, de-duplicated and stripped of
# nils before the lookups run inside a single database transaction.
#
# @param tag_names [Array<String>] Tag names to find or create
# @param parent_id [Integer, nil] Parent tag ID for hierarchical tags
# @param options [Hash] Accepted for interface compatibility; not read here
# @return [Array<Tag>] One tag per distinct cleaned name
# @raise [SmartRAG::Errors::DatabaseError] on any failure (a nil tag_names
#   argument is reported through this error as well)
def find_or_create_tags(tag_names, parent_id = nil, options = {})
  raise ArgumentError, "Tag names cannot be nil" unless tag_names
  return [] if tag_names.empty?

  normalized_names = tag_names.map { |raw_name| clean_tag_name(raw_name) }.uniq.compact

  Models::Tag.db.transaction do
    normalized_names.map { |tag_name| Models::Tag.find_or_create(tag_name, parent_id: parent_id) }
  end
rescue StandardError => e
  log_error("Failed to find or create tags", e)
  raise ::SmartRAG::Errors::DatabaseError, "Tag creation failed: #{e.message}"
end
|
|
139
|
+
|
|
140
|
+
# Create a tree of tags from a nested Hash description.
#
# Each key becomes a tag under parent_id. A Hash value is processed
# recursively with that tag as parent; an Array value is treated as a list
# of leaf names created via find_or_create_tags. Any other value (e.g. nil)
# adds no children.
#
# @param hierarchy [Hash] Tag hierarchy structure
# @param parent_id [Integer, nil] Parent tag ID (used by recursive calls)
# @return [Array<Tag>] All tags touched, parents before children, de-duplicated
# @raise [SmartRAG::Errors::DatabaseError] on any failure (a nil hierarchy
#   is reported through this error as well)
# @example
#   create_hierarchy({
#     "Technology" => {
#       "AI" => ["Machine Learning", "Deep Learning"],
#       "Web" => ["Frontend", "Backend"]
#     }
#   })
def create_hierarchy(hierarchy, parent_id = nil)
  raise ArgumentError, "Hierarchy cannot be nil" unless hierarchy
  return [] if hierarchy.empty?

  collected = []

  Models::Tag.db.transaction do
    hierarchy.each do |name, children|
      node = Models::Tag.find_or_create(clean_tag_name(name), parent_id: parent_id)
      collected << node

      case children
      when Hash
        # Nested level: recurse with the new tag as parent.
        collected.concat(create_hierarchy(children, node.id))
      when Array
        # Flat list of child names.
        collected.concat(find_or_create_tags(children, node.id))
      end
    end
  end

  collected.uniq
rescue StandardError => e
  log_error("Failed to create tag hierarchy", e)
  raise ::SmartRAG::Errors::DatabaseError, "Hierarchy creation failed: #{e.message}"
end
|
|
180
|
+
|
|
181
|
+
# Attach tags to a section and return the resulting join rows.
#
# All work happens in one transaction. Each insert runs inside its own
# savepoint: on databases such as PostgreSQL a failed statement aborts the
# enclosing transaction, so rescuing a unique-constraint violation without
# a savepoint would make the follow-up lookup (and everything after it) fail.
#
# @param section [SourceSection] The section to tag
# @param tags [Array<Tag, Integer, String>] Tags, tag ids or tag names
# @param options [Hash] Association options
# @option options [Boolean] :replace_existing Remove the section's current
#   tags before adding the new ones
# @return [Array<SectionTag>] Association rows found for the given tags
# @raise [ArgumentError] if section or tags is nil, or a tag cannot be resolved
# @raise [SmartRAG::Errors::DatabaseError] for any other failure
def associate_with_section(section, tags, options = {})
  raise ArgumentError, "Section cannot be nil" unless section
  raise ArgumentError, "Tags cannot be nil" unless tags
  return [] if tags.empty?

  # ArgumentError from tag resolution propagates via the rescue clause below.
  tag_objects = ensure_tag_objects(tags)
  db = Models::Tag.db

  db.transaction do
    section.remove_all_tags if options[:replace_existing]

    tag_objects.map do |tag|
      begin
        db.transaction(savepoint: true) { section.add_tag(tag) }
      rescue Sequel::UniqueConstraintViolation
        # Already associated; the savepoint was rolled back, so the outer
        # transaction is still usable for the lookup below.
      end

      Models::SectionTag.find(section_id: section.id, tag_id: tag.id)
    end.compact
  end
rescue ArgumentError
  raise
rescue StandardError => e
  log_error("Failed to associate tags with section", e)
  raise ::SmartRAG::Errors::DatabaseError, "Tag association failed: #{e.message}"
end
|
|
221
|
+
|
|
222
|
+
# Attach the same set of tags to several sections.
#
# Sections are processed inside one transaction, each within its own
# savepoint. A section that fails is logged and skipped; the savepoint keeps
# that failure from aborting the shared transaction (on PostgreSQL a failed
# statement otherwise poisons every later statement in the transaction).
#
# @param sections [Array<SourceSection>] Sections to tag
# @param tags [Array<Tag, Integer, String>] Tags, tag ids or tag names
# @param options [Hash] Options forwarded to associate_with_section
# @return [Array<SectionTag>] Associations from every section that succeeded
# @raise [SmartRAG::Errors::DatabaseError] on any failure outside the
#   per-section handling (nil arguments are reported through this error too)
def batch_associate_with_sections(sections, tags, options = {})
  raise ArgumentError, "Sections cannot be nil" unless sections
  raise ArgumentError, "Tags cannot be nil" unless tags
  return [] if sections.empty? || tags.empty?

  # Resolve tags once so every section reuses the same objects.
  tag_objects = ensure_tag_objects(tags)
  db = Models::Tag.db
  results = []

  db.transaction do
    sections.each do |section|
      begin
        associations = db.transaction(savepoint: true) do
          associate_with_section(section, tag_objects, options)
        end
        results.concat(associations)
      rescue StandardError => e
        logger.error "Failed to associate tags with section #{section.id}: #{e.message}"
      end
    end
  end

  results
rescue StandardError => e
  log_error("Failed to batch associate tags", e)
  raise ::SmartRAG::Errors::DatabaseError, "Batch association failed: #{e.message}"
end
|
|
250
|
+
|
|
251
|
+
# Detach tags from a section.
#
# With tags = nil every tag is removed. Otherwise only the listed tags that
# are currently attached are removed, inside one transaction.
#
# @param section [SourceSection] The section
# @param tags [Array<Tag, Integer, String>, nil] Tags to remove (nil = all)
# @return [Integer] Number of removed associations
# @raise [ArgumentError] if section is nil or a tag cannot be resolved
# @raise [SmartRAG::Errors::DatabaseError] for any other failure
def dissociate_from_section(section, tags = nil)
  raise ArgumentError, "Section cannot be nil" unless section

  if tags.nil?
    # Count first: the tag list is emptied by the removal.
    total = section.tags.count
    section.remove_all_tags
    return total
  end

  targets =
    begin
      ensure_tag_objects(tags)
    rescue ArgumentError => e
      raise ArgumentError, e.message
    end

  removed = 0
  Models::Tag.db.transaction do
    targets.each do |tag|
      next unless section.tags.include?(tag)

      section.remove_tag(tag)
      removed += 1
    end
  end

  removed
rescue ArgumentError
  raise
rescue StandardError => e
  log_error("Failed to dissociate tags from section", e)
  raise ::SmartRAG::Errors::DatabaseError, "Tag dissociation failed: #{e.message}"
end
|
|
290
|
+
|
|
291
|
+
# Fetch the sections carrying a tag, optionally narrowed by filters.
#
# @param tag [Tag] The tag to filter by (must respond to sections_dataset)
# @param options [Hash] Filter options
# @option options [Object] :document_id Keep only sections of this document
# @option options [Boolean] :has_embedding Join the :embedding association
# @option options [Integer] :limit Maximum number of rows
# @return [Array<SourceSection>] Matching sections
# @raise [SmartRAG::Errors::DatabaseError] on any failure (a nil tag is
#   reported through this error as well)
def get_sections_by_tag(tag, options = {})
  raise ArgumentError, "Tag cannot be nil" unless tag

  dataset = tag.sections_dataset
  dataset = dataset.where(document_id: options[:document_id]) if options[:document_id]
  dataset = dataset.association_join(:embedding) if options[:has_embedding]
  dataset = dataset.limit(options[:limit]) if options[:limit]

  dataset.all
rescue StandardError => e
  log_error("Failed to get sections by tag", e)
  raise ::SmartRAG::Errors::DatabaseError, "Tag query failed: #{e.message}"
end
|
|
318
|
+
|
|
319
|
+
# List the tags attached to a section, optionally with their ancestors.
#
# @param section [SourceSection] The section
# @param include_ancestors [Boolean] Also return every ancestor of each tag
# @return [Array<Tag>] The section's tags; with include_ancestors, the tags
#   followed by their ancestors, de-duplicated
# @raise [SmartRAG::Errors::DatabaseError] on any failure (a nil section is
#   reported through this error as well)
def get_tags_for_section(section, include_ancestors: false)
  raise ArgumentError, "Section cannot be nil" unless section

  direct_tags = section.tags
  return direct_tags unless include_ancestors

  # Accumulate into a copy so the section's own tag list is left untouched.
  direct_tags
    .each_with_object(direct_tags.dup) { |tag, expanded| expanded.concat(tag.ancestors) }
    .uniq
rescue StandardError => e
  log_error("Failed to get tags for section", e)
  raise ::SmartRAG::Errors::DatabaseError, "Tag query failed: #{e.message}"
end
|
|
341
|
+
|
|
342
|
+
# Case-insensitive substring search over tag names.
#
# The query is matched literally: LIKE metacharacters (%, _ and the
# backslash escape character) are escaped so that user input such as "100%"
# or "c_lang" cannot act as a wildcard.
#
# @param query [#to_s] Search text
# @param options [Hash] Search options
# @option options [Integer] :limit Maximum results (default: 20)
# @option options [Boolean] :include_usage Start from Tag.with_section_count
#   instead of the plain model
# @return [Array<Tag>] Matching tags
# @raise [ArgumentError] if query is nil, empty or whitespace-only
# @raise [SmartRAG::Errors::DatabaseError] for any other failure
def search_tags(query, options = {})
  raise ArgumentError, "Query cannot be empty" if query.to_s.strip.empty?

  limit = options[:limit] || 20

  # Backslash is the escape character Sequel emits for LIKE/ILIKE patterns.
  literal = query.to_s.downcase.gsub(/[\\%_]/) { |meta| "\\#{meta}" }
  name_filter = Sequel.ilike(:name, "%#{literal}%")

  scope = options[:include_usage] ? Models::Tag.with_section_count : Models::Tag
  scope.where(name_filter).limit(limit).all
rescue ArgumentError
  raise
rescue StandardError => e
  log_error("Failed to search tags", e)
  raise ::SmartRAG::Errors::DatabaseError, "Tag search failed: #{e.message}"
end
|
|
368
|
+
|
|
369
|
+
# Return the most used tags, as reported by Tag.popular.
#
# @param limit [Integer] Number of results (default: 20)
# @return [Object] Whatever Tag.popular returns for this limit
#   (presumably tags with a usage count; verify against the model)
# @raise [SmartRAG::Errors::DatabaseError] if the query fails
def get_popular_tags(limit = 20)
  most_used = Models::Tag.popular(limit: limit)
  most_used
rescue StandardError => e
  log_error("Failed to get popular tags", e)
  raise ::SmartRAG::Errors::DatabaseError, "Popular tags query failed: #{e.message}"
end
|
|
378
|
+
|
|
379
|
+
# Return the tag tree, as built by Tag.hierarchy.
#
# @return [Object] Whatever Tag.hierarchy returns
#   (presumably a nested structure of tags; verify against the model)
# @raise [SmartRAG::Errors::DatabaseError] if the query fails
def get_tag_hierarchy
  tree = Models::Tag.hierarchy
  tree
rescue StandardError => e
  log_error("Failed to get tag hierarchy", e)
  raise ::SmartRAG::Errors::DatabaseError, "Hierarchy query failed: #{e.message}"
end
|
|
387
|
+
|
|
388
|
+
private
|
|
389
|
+
|
|
390
|
+
# Assemble the LLM prompt: one prompt per supported language family, joined
# by a "---" separator line.
#
# Language codes are compared case-insensitively with "-" treated as "_",
# and region variants map to their family, so :zh_cn, "zh-CN" and :zh all
# select the Chinese prompt and :en / :en_us the English one. Each family
# is emitted at most once. Unsupported codes are logged and skipped.
#
# @param text [String] Text to analyze (already truncated by the caller)
# @param topic [String, nil] Optional topic line for the prompt
# @param languages [Array<Symbol, String>, Symbol, String, nil] Language codes
# @param options [Hash] Prompt options
# @option options [Integer] :max_category_tags Cap on category tags (default: 3)
# @option options [Integer] :max_content_tags Cap on content tags (default: 5)
# @option options [Boolean] :include_category Ask for category tags (default: true)
# @option options [Boolean] :include_content Ask for content tags (default: true)
# @return [String] The combined prompt ("" when no language is supported)
def build_tag_generation_prompt(text, topic, languages, options)
  max_category_tags = options[:max_category_tags] || 3
  max_content_tags = options[:max_content_tags] || 5
  include_category = options.fetch(:include_category, true)
  include_content = options.fetch(:include_content, true)

  families = Array(languages).map do |lang|
    code = lang.to_s.strip.downcase.tr("-", "_")
    case code
    when "zh", /\Azh_/ then :zh
    when "en", /\Aen_/ then :en
    else
      logger.warn "Unsupported language: #{lang}, skipping"
      nil
    end
  end.compact.uniq

  prompts = families.map do |family|
    if family == :zh
      build_chinese_prompt(text, topic, max_category_tags, max_content_tags, include_category, include_content)
    else
      build_english_prompt(text, topic, max_category_tags, max_content_tags, include_category, include_content)
    end
  end

  prompts.join("\n\n---\n\n")
end
|
|
411
|
+
|
|
412
|
+
# Compose the Chinese-language tagging prompt.
#
# @param text [String] Text to analyze
# @param topic [String, nil] Optional topic line
# @param max_category_tags [Integer] Cap stated for category tags
# @param max_content_tags [Integer] Cap stated for content tags
# @param include_category [Boolean] Include the category-tag requirement
# @param include_content [Boolean] Include the content-tag requirement
# @return [String] Prompt asking for a JSON object with "categories" and
#   "content_tags"
def build_chinese_prompt(text, topic, max_category_tags, max_content_tags, include_category, include_content)
  segments = ["分析以下文本并生成标签:\n\n", "文本内容:\n#{text}\n\n"]
  segments << "主题:#{topic}\n\n" if topic
  segments << "要求:\n"

  if include_category
    segments << "1. 生成#{max_category_tags}个以内的高层级分类标签(如:人工智能、机器学习、深度学习等)\n"
  end

  if include_content
    # The content requirement is numbered 2 only when a category requirement precedes it.
    ordinal = include_category ? "2" : "1"
    segments << "#{ordinal}. 生成#{max_content_tags}个以内的具体内容标签(描述文本的关键概念、技术、方法等)\n"
  end

  segments << "\n"
  segments << "以下列JSON格式输出:\n"
  segments << "{\"categories\": [...], \"content_tags\": [...]}\n"
  segments << "只输出JSON,不要额外解释。"

  segments.join
end
|
|
434
|
+
|
|
435
|
+
# Compose the English-language tagging prompt.
#
# @param text [String] Text to analyze
# @param topic [String, nil] Optional topic line
# @param max_category_tags [Integer] Cap stated for category tags
# @param max_content_tags [Integer] Cap stated for content tags
# @param include_category [Boolean] Include the category-tag requirement
# @param include_content [Boolean] Include the content-tag requirement
# @return [String] Prompt asking for a JSON object with "categories" and
#   "content_tags"
def build_english_prompt(text, topic, max_category_tags, max_content_tags, include_category, include_content)
  segments = ["Analyze the following text and generate tags:\n\n", "Text content:\n#{text}\n\n"]
  segments << "Topic: #{topic}\n\n" if topic
  segments << "Requirements:\n"

  if include_category
    segments << "1. Generate up to #{max_category_tags} high-level category tags (e.g., Artificial Intelligence, Machine Learning, Deep Learning)\n"
  end

  if include_content
    # The content requirement is numbered 2 only when a category requirement precedes it.
    ordinal = include_category ? "2" : "1"
    segments << "#{ordinal}. Generate up to #{max_content_tags} specific content tags (describing key concepts, techniques, methods from the text)\n"
  end

  segments << "\n"
  segments << "Output in the following JSON format:\n"
  segments << "{\"categories\": [...], \"content_tags\": [...]}\n"
  segments << "Output only JSON, no additional explanation."

  segments.join
end
|
|
457
|
+
|
|
458
|
+
# Send the prompt to the :analyze_content worker, retrying on failure.
#
# @param prompt [String] Prompt text, passed to the worker as :content
# @param options [Hash] Per-call overrides
# @option options [Integer] :retries Attempts (falls back to config[:max_retries])
# @option options [Integer] :timeout Per-attempt timeout (falls back to config[:timeout])
# @return [Object] The worker's (truthy) result
# @raise [StandardError] the last failure, after it has been logged; a falsy
#   worker result is reported as RuntimeError "No response from LLM"
def call_llm_for_tags(prompt, options)
  retry_settings = {
    max_retries: options[:retries] || config[:max_retries],
    timeout: options[:timeout] || config[:timeout],
  }

  with_retry(**retry_settings) do
    smart_prompt_engine.call_worker(:analyze_content, { content: prompt }) || raise("No response from LLM")
  end
rescue StandardError => e
  logger.error "LLM call failed: #{e.message}"
  raise
end
|
|
472
|
+
|
|
473
|
+
# Turn an LLM reply into { categories: [...], content_tags: [...] }.
#
# Parsing is attempted in decreasing order of strictness:
#   1. the whole reply as JSON;
#   2. the outermost {...} span of the reply as JSON (covers replies wrapped
#      in Markdown code fences or surrounded by prose);
#   3. regular expressions over the "categories" / "content_tags" arrays;
#   4. as a last resort, every double-quoted string in the reply, the first
#      max(n / 3, 2) of them taken as categories and the rest as content tags.
# Every name is normalized with clean_tag_name and nils are dropped.
#
# @param response [Object, nil] Raw LLM response (String, Hash or other)
# @param languages [Array<Symbol>] Accepted for interface compatibility; not read
# @return [Hash] { categories: Array<String>, content_tags: Array<String> }
def parse_generated_tags(response, languages)
  # Not every response object implements #empty?, so probe before calling it.
  if response.nil? || (response.respond_to?(:empty?) && response.empty?)
    return { categories: [], content_tags: [] }
  end

  raw_content = extract_response_content(response).to_s
  return { categories: [], content_tags: [] } if raw_content.strip.empty?

  # Only Arrays and single Strings are meaningful tag lists; anything else
  # (nil, numbers, nested objects) counts as "no tags".
  as_tag_list = lambda do |value|
    case value
    when Array then value
    when String then [value]
    else []
    end
  end

  json_candidates = [raw_content, raw_content[/\{.*\}/m]].compact.uniq
  json_candidates.each do |candidate|
    begin
      parsed = JSON.parse(candidate)
    rescue JSON::ParserError
      next
    end
    # Valid JSON that is not an object (array, string, number) carries no keys.
    next unless parsed.is_a?(Hash)

    return {
      categories: as_tag_list.call(parsed["categories"]).map { |c| clean_tag_name(c) }.compact,
      content_tags: as_tag_list.call(parsed["content_tags"]).map { |c| clean_tag_name(c) }.compact,
    }
  end

  logger.warn "Failed to parse LLM response as JSON, attempting manual parsing"

  # JSON-like fragments: pull the quoted items out of each array literal.
  categories = raw_content[/\{\s*"categories"\s*:\s*\[([^\]]+)\]/, 1].to_s.scan(/"([^"]+)"/).flatten
  content_tags = raw_content[/"content_tags"\s*:\s*\[([^\]]+)\]/, 1].to_s.scan(/"([^"]+)"/).flatten

  if categories.empty? && content_tags.empty?
    all_quotes = raw_content.scan(/"([^"]+)"/).flatten
    if all_quotes.any?
      # Heuristic: first few are categories, rest are content tags.
      # first/drop never return nil, even when fewer strings than the split point exist.
      midpoint = [all_quotes.length / 3, 2].max
      categories = all_quotes.first(midpoint)
      content_tags = all_quotes.drop(midpoint)
    end
  end

  {
    categories: categories.map { |c| clean_tag_name(c) }.compact,
    content_tags: content_tags.map { |c| clean_tag_name(c) }.compact,
  }
end
|
|
523
|
+
|
|
524
|
+
# Pull the text payload out of an LLM response.
#
# For a Hash, the first truthy value among these is returned: the
# chat-completion path choices[0].message.content with String keys, the same
# path with Symbol keys, the "content" key, the :content key. When none is
# present, or the response is not a Hash, response.to_s is returned.
#
# @param response [Hash, Object] Raw LLM response
# @return [Object] The extracted content (normally a String)
def extract_response_content(response)
  return response.to_s unless response.is_a?(Hash)

  content = response.dig("choices", 0, "message", "content")
  content ||= response.dig(:choices, 0, :message, :content)
  content ||= response["content"]
  content ||= response[:content]
  content || response.to_s
end
|
|
536
|
+
|
|
537
|
+
# Build the text sent to the LLM for one section: an optional heading
# followed by the section content, separated by a blank line.
#
# The heading is "Section <number>: <title>" when both are present, the bare
# title when only the title is present, and omitted otherwise.
#
# @param section [SourceSection] Responds to section_number, section_title, content
# @return [String] Heading and content joined by "\n\n" (nil parts are dropped)
def prepare_section_text(section)
  number = section.section_number
  title = section.section_title

  heading = number && title ? "Section #{number}: #{title}" : (title || nil)

  [heading, section.content].compact.join("\n\n")
end
|
|
545
|
+
|
|
546
|
+
# Resolve a mixed list of tag references into tag objects.
#
# Accepted items: a Models::Tag (kept as is), an Integer (looked up by id),
# a String (looked up by name), or any object responding to both #id and
# #name (kept as is, which lets test doubles through).
#
# @param tags [Array<Tag, Integer, String, Object>] Tag references
# @return [Array<Object>] One resolved tag per input item, in input order
# @raise [ArgumentError] if a lookup finds nothing or an item has an
#   unsupported type
def ensure_tag_objects(tags)
  tags.map do |candidate|
    if candidate.is_a?(Models::Tag)
      candidate
    elsif candidate.is_a?(Integer)
      Models::Tag.find(id: candidate) || raise(ArgumentError, "Tag not found: #{candidate}")
    elsif candidate.is_a?(String)
      Models::Tag.find(name: candidate) || raise(ArgumentError, "Tag not found: #{candidate}")
    elsif candidate.respond_to?(:id) && candidate.respond_to?(:name)
      # Duck-typed tag, e.g. an RSpec double.
      candidate
    else
      raise ArgumentError, "Invalid tag type: #{candidate.class}"
    end
  end
end
|
|
565
|
+
|
|
566
|
+
# Normalize a tag name, or return nil when nothing usable remains.
#
# Names containing Han characters are only stripped of surrounding
# whitespace. For all other names, every character that is not a Unicode
# letter, combining mark, digit, underscore, whitespace or hyphen is replaced
# by a space, and runs of whitespace are collapsed. Letters outside ASCII
# (accented Latin, Cyrillic, kana, ...) are therefore preserved.
#
# @param name [#to_s, nil] Raw tag name
# @return [String, nil] Cleaned name; nil for nil, blank or punctuation-only input
def clean_tag_name(name)
  return nil if name.nil?

  cleaned = name.to_s.strip
  return nil if cleaned.empty?

  # Han text is kept verbatim (no punctuation stripping).
  return cleaned if cleaned.match?(/\p{Han}/)

  normalized = cleaned.gsub(/[^\p{L}\p{M}\p{N}_\s\-]/, " ").gsub(/\s+/, " ").strip

  # A name made only of punctuation collapses to ""; report it as "no tag"
  # so callers' .compact drops it instead of creating an empty-named tag.
  normalized.empty? ? nil : normalized
end
|
|
576
|
+
|
|
577
|
+
# Run the block under a timeout, retrying failed attempts with exponential
# backoff (1s, 2s, 4s, ... between attempts).
#
# The block always runs at least once: a max_retries of 0, nil or a negative
# number is treated as a single attempt instead of skipping the block and
# failing with an unrelated TypeError from `raise nil`.
#
# @param max_retries [Integer, nil] Total number of attempts (minimum 1)
# @param timeout [Numeric, nil] Seconds allowed per attempt, as accepted by
#   Timeout.timeout
# @yield The operation to attempt
# @return [Object] The block's result from the first successful attempt
# @raise [StandardError] the error from the final attempt when all fail
def with_retry(max_retries:, timeout:, &block)
  attempts = [max_retries.to_i, 1].max
  last_exception = nil

  attempts.times do |attempt|
    begin
      return Timeout.timeout(timeout) { yield }
    rescue StandardError => e
      last_exception = e
      logger.warn "Attempt #{attempt + 1} failed: #{e.message}"

      # Back off before the next attempt; no point sleeping after the last one.
      sleep(2**attempt) if attempt < attempts - 1
    end
  end

  raise last_exception
end
|
|
596
|
+
|
|
597
|
+
# Log an error message followed by the exception's backtrace, when it has one.
#
# Exceptions that were constructed but never raised have a nil backtrace;
# the backtrace line is skipped in that case so that the error handler does
# not itself fail with NoMethodError and hide the original problem.
#
# @param message [String] Context describing what failed
# @param exception [Exception] The error being reported
# @return [void]
def log_error(message, exception)
  logger.error "#{message}: #{exception.message}"

  trace = exception.backtrace
  logger.error trace.join("\n") if trace && !trace.empty?
end
|
|
601
|
+
|
|
602
|
+
# Default settings, merged under the caller's configuration by the constructor.
# A new Hash (with a new STDOUT logger) is built on every call.
#
# @return [Hash] :config_path, :max_retries, :timeout, :batch_size,
#   :max_text_length and :logger
def default_config
  llm_defaults = { config_path: "config/llm_config.yml", max_retries: 3, timeout: 30 }
  processing_defaults = { batch_size: 50, max_text_length: 4000 }

  llm_defaults.merge(processing_defaults).merge(logger: Logger.new(STDOUT))
end
|
|
612
|
+
end
|
|
613
|
+
end
|
|
614
|
+
end
|