smart_rag 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +33 -0
  3. data/README.en.md +115 -0
  4. data/README.md +144 -0
  5. data/config/database.yml +42 -0
  6. data/config/fulltext_search.yml +111 -0
  7. data/config/llm_config.yml +15 -0
  8. data/config/smart_rag.yml +156 -0
  9. data/db/fix_search_issues.sql +81 -0
  10. data/db/migrations/001_create_source_documents.rb +26 -0
  11. data/db/migrations/002_create_source_sections.rb +20 -0
  12. data/db/migrations/003_create_tags.rb +17 -0
  13. data/db/migrations/004_create_research_topics.rb +16 -0
  14. data/db/migrations/005_create_relationship_tables.rb +42 -0
  15. data/db/migrations/006_create_text_search_configs.rb +28 -0
  16. data/db/migrations/007_create_section_fts.rb +109 -0
  17. data/db/migrations/008_create_embeddings.rb +28 -0
  18. data/db/migrations/009_create_search_logs.rb +30 -0
  19. data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
  20. data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
  21. data/db/rebuild_fts_complete.sql +51 -0
  22. data/db/seeds/text_search_configs.sql +28 -0
  23. data/examples/01_quick_start.rb +32 -0
  24. data/examples/02_document_management.rb +41 -0
  25. data/examples/03_search_operations.rb +46 -0
  26. data/examples/04_topics_and_tags.rb +38 -0
  27. data/examples/05_advanced_patterns.rb +154 -0
  28. data/examples/06_error_handling_and_retry.rb +64 -0
  29. data/examples/README.md +42 -0
  30. data/examples/common.rb +57 -0
  31. data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
  32. data/lib/smart_rag/config.rb +126 -0
  33. data/lib/smart_rag/core/document_processor.rb +537 -0
  34. data/lib/smart_rag/core/embedding.rb +340 -0
  35. data/lib/smart_rag/core/fulltext_manager.rb +483 -0
  36. data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
  37. data/lib/smart_rag/core/query_processor.rb +577 -0
  38. data/lib/smart_rag/errors.rb +88 -0
  39. data/lib/smart_rag/models/embedding.rb +140 -0
  40. data/lib/smart_rag/models/model_base.rb +106 -0
  41. data/lib/smart_rag/models/research_topic.rb +171 -0
  42. data/lib/smart_rag/models/research_topic_section.rb +86 -0
  43. data/lib/smart_rag/models/research_topic_tag.rb +89 -0
  44. data/lib/smart_rag/models/search_log.rb +198 -0
  45. data/lib/smart_rag/models/section_fts.rb +170 -0
  46. data/lib/smart_rag/models/section_tag.rb +81 -0
  47. data/lib/smart_rag/models/source_document.rb +204 -0
  48. data/lib/smart_rag/models/source_section.rb +201 -0
  49. data/lib/smart_rag/models/tag.rb +214 -0
  50. data/lib/smart_rag/models/text_search_config.rb +168 -0
  51. data/lib/smart_rag/models.rb +116 -0
  52. data/lib/smart_rag/parsers/query_parser.rb +291 -0
  53. data/lib/smart_rag/retrieve.rb +745 -0
  54. data/lib/smart_rag/services/embedding_service.rb +278 -0
  55. data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
  56. data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
  57. data/lib/smart_rag/services/summarization_service.rb +322 -0
  58. data/lib/smart_rag/services/tag_service.rb +614 -0
  59. data/lib/smart_rag/services/vector_search_service.rb +347 -0
  60. data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
  61. data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
  62. data/lib/smart_rag/smart_chunking/merger.rb +94 -0
  63. data/lib/smart_rag/smart_chunking/parser.rb +75 -0
  64. data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
  65. data/lib/smart_rag/smart_chunking/section.rb +11 -0
  66. data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
  67. data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
  68. data/lib/smart_rag/version.rb +3 -0
  69. data/lib/smart_rag.rb +986 -0
  70. data/workers/analyze_content.rb +6 -0
  71. data/workers/get_embedding.rb +7 -0
  72. metadata +311 -0
@@ -0,0 +1,614 @@
1
require "json"
require "logger"
require "timeout"

require_relative "../models/tag"
require_relative "../models/section_tag"
require_relative "../models/source_section"
require_relative "../errors"
require "smart_prompt"
6
+
7
+ module SmartRAG
8
+ module Services
9
+ # Service for managing tags, including generation, hierarchy, and content associations
10
+ class TagService
11
+ attr_reader :config, :smart_prompt_engine, :logger
12
+
13
# Build a TagService: resolve configuration, load the worker definitions and
# create the SmartPrompt engine used for LLM calls.
#
# @param config [Hash, nil] Configuration options; nil is treated as {}
# @option config [String] :config_path Path to smart_prompt config (default: config/llm_config.yml)
# @option config [Integer] :max_retries Maximum retries for API calls (default: 3)
# @option config [Integer] :timeout Timeout for API calls (default: 30)
# @option config [Logger] :logger Logger instance (default: Logger.new(STDOUT))
# @raise [StandardError] any initialization failure is logged and re-raised
def initialize(config = {})
  # A fallback logger must exist before anything can fail, because the
  # rescue clause below logs through it.
  @logger = Logger.new(STDOUT)
  @config = default_config.merge(config || {})
  @logger = @config[:logger] || @logger

  # Worker definitions are looked up in "workers", three directories above
  # the directory that contains this file.
  worker_glob = File.join(File.dirname(__FILE__), "..", "..", "..", "workers", "*.rb")
  Dir.glob(worker_glob).each { |worker_file| require worker_file }

  engine_config_path = @config[:config_path] || "config/llm_config.yml"
  @smart_prompt_engine = SmartPrompt::Engine.new(engine_config_path)
rescue StandardError => e
  log_error("Failed to initialize TagService", e)
  raise
end
36
+
37
# Guard used before tag generation: rejects missing or whitespace-only text.
#
# @param text [#to_s] The text to validate
# @return [nil] when the text is acceptable
# @raise [ArgumentError] if text is nil, empty or only whitespace
def validate_input!(text)
  return unless text.to_s.strip.empty?

  raise ArgumentError, "Text cannot be empty"
end
43
+
44
# Generate tags for text content using the LLM.
#
# All arguments are positional; pass the topic as the second argument.
#
# @param text [String, #to_s] Text content to analyze (coerced with #to_s)
# @param topic [String, nil] Topic/context for the text (optional)
# @param languages [Array<Symbol>] Target languages for tags (e.g., [:zh_cn, :en])
# @param options [Hash] Additional options, forwarded to the prompt builder and the LLM call
# @option options [Boolean] :include_category Include category tags (default: true)
# @option options [Boolean] :include_content Include content tags (default: true)
# @option options [Integer] :max_category_tags Maximum category tags (default: 3)
# @option options [Integer] :max_content_tags Maximum content tags (default: 5)
# @option options [Integer] :retries Overrides config[:max_retries] for this call
# @option options [Integer] :timeout Overrides config[:timeout] for this call
# @return [Hash] Generated tags: { categories: [...], content_tags: [...] }
# @raise [ArgumentError] if text is nil or blank
# @raise [SmartRAG::Errors::TagGenerationError] on any other failure
# @example
#   generate_tags("Machine learning algorithms...", "AI Research")
#   # => {
#   #   categories: ["Machine Learning", "Artificial Intelligence"],
#   #   content_tags: ["Neural Networks", "Deep Learning", "Training Data"]
#   # }
def generate_tags(text, topic = nil, languages = [:zh_cn, :en], options = {})
  validate_input!(text)

  # validate_input! accepts anything whose #to_s is non-blank, so work on the
  # string form from here on instead of assuming a String was passed.
  content = text.to_s

  # Keep the prompt bounded; the ellipsis marks that the text was cut.
  max_text_length = config[:max_text_length] || 4000
  truncated_text = content.length > max_text_length ? "#{content[0...max_text_length]}..." : content

  prompt = build_tag_generation_prompt(truncated_text, topic, languages, options)
  response = call_llm_for_tags(prompt, options)
  parse_generated_tags(response, languages)
rescue ArgumentError
  # Validation errors are part of the public contract; pass them through untouched.
  raise
rescue StandardError => e
  log_error("Failed to generate tags", e)
  raise ::SmartRAG::Errors::TagGenerationError, "Tag generation failed: #{e.message}"
end
84
+
85
# Generate tags for several sections, one LLM call per section.
#
# A failure for one section is logged and recorded as empty tags; it does not
# stop the batch. Note that the blanket rescue at the bottom also catches the
# nil-guard's ArgumentError, so a nil `sections` surfaces as TagGenerationError.
#
# @param sections [Array<SourceSection>] Sections to generate tags for
# @param topic [String] Topic/context for the content
# @param languages [Array<Symbol>] Target languages for tags
# @param options [Hash] Additional options (forwarded to generate_tags)
# @return [Hash] Mapping of section_id to generated tags
# @raise [SmartRAG::Errors::TagGenerationError] if the batch itself fails
def batch_generate_tags(sections, topic = nil, languages = [:zh_cn, :en], options = {})
  raise ArgumentError, "Sections cannot be nil" unless sections
  return {} if sections.empty?

  logger.info "Generating tags for #{sections.size} sections"

  sections.each_with_index.each_with_object({}) do |(section, position), tags_by_section|
    begin
      tags_by_section[section.id] = generate_tags(prepare_section_text(section), topic, languages, options)
      logger.info "Generated tags for section #{section.id} (#{position + 1}/#{sections.size})"
    rescue StandardError => e
      logger.error "Failed to generate tags for section #{section.id}: #{e.message}"
      # Keep the section in the result so callers see every requested id.
      tags_by_section[section.id] = { categories: [], content_tags: [] }
    end
  end
rescue StandardError => e
  log_error("Failed to batch generate tags", e)
  raise ::SmartRAG::Errors::TagGenerationError, "Batch tag generation failed: #{e.message}"
end
116
+
117
# Look up tags by name, creating the ones that do not exist yet.
#
# Names are normalized with clean_tag_name; duplicates and names that clean to
# nil are dropped. All lookups/creations run inside one database transaction.
# Because of the blanket rescue, the nil-guard's ArgumentError surfaces as
# DatabaseError.
#
# @param tag_names [Array<String>] Tag names to find or create
# @param parent_id [Integer] Parent tag ID for hierarchical tags
# @param options [Hash] Additional options (currently not read)
# @return [Array<Tag>] Array of tag objects
# @raise [SmartRAG::Errors::DatabaseError] on any failure
def find_or_create_tags(tag_names, parent_id = nil, options = {})
  raise ArgumentError, "Tag names cannot be nil" unless tag_names
  return [] if tag_names.empty?

  normalized_names = tag_names.map { |raw_name| clean_tag_name(raw_name) }
  wanted_names = normalized_names.uniq.compact

  Models::Tag.db.transaction do
    wanted_names.map { |tag_name| Models::Tag.find_or_create(tag_name, parent_id: parent_id) }
  end
rescue StandardError => e
  log_error("Failed to find or create tags", e)
  raise ::SmartRAG::Errors::DatabaseError, "Tag creation failed: #{e.message}"
end
139
+
140
# Create a tree of tags from a nested structure.
#
# Each key becomes a tag under `parent_id`. A Hash value is processed
# recursively, an Array value is treated as a list of leaf names, and any
# other value means "no children".
#
# @param hierarchy [Hash] Tag hierarchy structure
# @param parent_id [Integer] Parent tag ID (for recursive calls)
# @return [Array<Tag>] Created (or found) tags, parents before their children, without duplicates
# @raise [SmartRAG::Errors::DatabaseError] on any failure (including a nil hierarchy)
# @example
#   create_hierarchy({
#     "Technology" => {
#       "AI" => ["Machine Learning", "Deep Learning"],
#       "Web" => ["Frontend", "Backend"]
#     }
#   })
def create_hierarchy(hierarchy, parent_id = nil)
  raise ArgumentError, "Hierarchy cannot be nil" unless hierarchy
  return [] if hierarchy.empty?

  collected = []

  Models::Tag.db.transaction do
    hierarchy.each do |name, children|
      node = Models::Tag.find_or_create(clean_tag_name(name), parent_id: parent_id)
      collected << node

      descendants =
        case children
        when Hash then create_hierarchy(children, node.id)
        when Array then find_or_create_tags(children, node.id)
        else []
        end
      collected.concat(descendants)
    end
  end

  collected.uniq
rescue StandardError => e
  log_error("Failed to create tag hierarchy", e)
  raise ::SmartRAG::Errors::DatabaseError, "Hierarchy creation failed: #{e.message}"
end
180
+
181
# Associate tags with a section.
#
# @param section [SourceSection] The section to tag
# @param tags [Array<Tag, Integer, String>] Tags to associate (objects, ids or names;
#   resolved through ensure_tag_objects)
# @param options [Hash] Options
# @option options [Boolean] :replace_existing Remove the section's current tags first
# @return [Array<SectionTag>] Association rows for the given tags (rows that cannot be found are dropped)
# @raise [ArgumentError] if section or tags is nil, or a tag cannot be resolved
# @raise [SmartRAG::Errors::DatabaseError] on any other failure
def associate_with_section(section, tags, options = {})
  raise ArgumentError, "Section cannot be nil" unless section
  raise ArgumentError, "Tags cannot be nil" unless tags
  return [] if tags.empty?

  # ArgumentError from the lookup propagates unchanged (with its original
  # backtrace) through the `rescue ArgumentError` clause below.
  tag_objects = ensure_tag_objects(tags)

  Models::Tag.db.transaction do
    section.remove_all_tags if options[:replace_existing]

    tag_objects.map do |tag|
      begin
        # Run the insert in a savepoint: on databases such as PostgreSQL a
        # failed statement aborts the enclosing transaction, which would make
        # the lookup below (and every later insert) fail as well.
        Models::Tag.db.transaction(savepoint: true) { section.add_tag(tag) }
      rescue Sequel::UniqueConstraintViolation
        # Already associated; the existing row is fetched below.
      end

      Models::SectionTag.find(section_id: section.id, tag_id: tag.id)
    end.compact
  end
rescue ArgumentError
  raise
rescue StandardError => e
  log_error("Failed to associate tags with section", e)
  raise ::SmartRAG::Errors::DatabaseError, "Tag association failed: #{e.message}"
end
221
+
222
# Associate the same tags with multiple sections (batch).
#
# Sections are processed best-effort: a failure for one section is logged and
# skipped. Each section runs in its own savepoint so that a failed statement
# is rolled back on its own and cannot leave the shared transaction in an
# aborted state for the remaining sections.
#
# @param sections [Array<SourceSection>] Sections to tag
# @param tags [Array<Tag, Integer, String>] Tags to associate
# @param options [Hash] Association options (forwarded to associate_with_section)
# @return [Array<SectionTag>] Associations of all sections that succeeded
# @raise [SmartRAG::Errors::DatabaseError] if the batch itself fails
#   (this includes nil arguments and unresolvable tags)
def batch_associate_with_sections(sections, tags, options = {})
  raise ArgumentError, "Sections cannot be nil" unless sections
  raise ArgumentError, "Tags cannot be nil" unless tags
  return [] if sections.empty? || tags.empty?

  # Resolve once so every section receives the same Tag objects.
  tag_objects = ensure_tag_objects(tags)
  results = []

  Models::Tag.db.transaction do
    sections.each do |section|
      begin
        associations = Models::Tag.db.transaction(savepoint: true) do
          associate_with_section(section, tag_objects, options)
        end
        results.concat(associations)
      rescue StandardError => e
        logger.error "Failed to associate tags with section #{section.id}: #{e.message}"
      end
    end
  end

  results
rescue StandardError => e
  log_error("Failed to batch associate tags", e)
  raise ::SmartRAG::Errors::DatabaseError, "Batch association failed: #{e.message}"
end
250
+
251
# Remove tag associations from a section.
#
# @param section [SourceSection] The section
# @param tags [Array<Tag, Integer, String>, nil] Tags to remove (if nil, remove all)
# @return [Integer] Number of removed associations
# @raise [ArgumentError] if section is nil or a tag cannot be resolved
# @raise [SmartRAG::Errors::DatabaseError] on any other failure
def dissociate_from_section(section, tags = nil)
  raise ArgumentError, "Section cannot be nil" unless section

  if tags.nil?
    # Count before removing; afterwards the association is empty.
    removed_count = section.tags.count
    section.remove_all_tags
    return removed_count
  end

  # ArgumentError from the lookup is re-raised as-is by the clause below, so
  # its original backtrace is preserved.
  tag_objects = ensure_tag_objects(tags)
  removed_count = 0

  Models::Tag.db.transaction do
    tag_objects.each do |tag|
      # Only count tags that are actually attached right now.
      next unless section.tags.include?(tag)

      section.remove_tag(tag)
      removed_count += 1
    end
  end

  removed_count
rescue ArgumentError
  raise
rescue StandardError => e
  log_error("Failed to dissociate tags from section", e)
  raise ::SmartRAG::Errors::DatabaseError, "Tag dissociation failed: #{e.message}"
end
290
+
291
# Fetch the sections carrying a tag, optionally narrowed by filters.
#
# @param tag [Tag] The tag to filter by
# @param options [Hash] Filter options
# @option options [Integer] :document_id Only sections of this document
# @option options [Boolean] :has_embedding Inner-join the :embedding association
# @option options [Integer] :limit Maximum number of rows
# @return [Array<SourceSection>] Filtered sections
# @raise [SmartRAG::Errors::DatabaseError] on any failure (including a nil tag)
def get_sections_by_tag(tag, options = {})
  raise ArgumentError, "Tag cannot be nil" unless tag

  dataset = tag.sections_dataset

  document_id = options[:document_id]
  dataset = dataset.where(document_id: document_id) if document_id

  dataset = dataset.association_join(:embedding) if options[:has_embedding]

  row_limit = options[:limit]
  dataset = dataset.limit(row_limit) if row_limit

  dataset.all
rescue StandardError => e
  log_error("Failed to get sections by tag", e)
  raise ::SmartRAG::Errors::DatabaseError, "Tag query failed: #{e.message}"
end
318
+
319
# Return the tags attached to a section, optionally with their ancestors.
#
# @param section [SourceSection] The section
# @param include_ancestors [Boolean] Also include every ancestor of each tag
# @return [Array<Tag>] The section's own tags; with include_ancestors, those
#   tags followed by their ancestors, duplicates removed
# @raise [SmartRAG::Errors::DatabaseError] on any failure (including a nil section)
def get_tags_for_section(section, include_ancestors: false)
  raise ArgumentError, "Section cannot be nil" unless section

  direct_tags = section.tags
  return direct_tags unless include_ancestors

  # Accumulate into a copy so the section's own tag list is left untouched.
  with_ancestors = direct_tags.each_with_object(direct_tags.dup) do |tag, collected|
    collected.concat(tag.ancestors)
  end

  with_ancestors.uniq
rescue StandardError => e
  log_error("Failed to get tags for section", e)
  raise ::SmartRAG::Errors::DatabaseError, "Tag query failed: #{e.message}"
end
341
+
342
# Search tags whose name contains the query (case-insensitive substring match).
#
# LIKE metacharacters in the query ("%", "_" and the escape character) are
# escaped, so they are matched literally instead of acting as wildcards.
#
# @param query [String, #to_s] Search query
# @param options [Hash] Search options
# @option options [Integer] :limit Maximum results (default: 20)
# @option options [Boolean] :include_usage Include usage count
# @return [Array<Tag>] Matching tags
# @raise [ArgumentError] if the query is nil or blank
# @raise [SmartRAG::Errors::DatabaseError] on any other failure
def search_tags(query, options = {})
  # Coerce once: validation is #to_s-based, so the pattern must be as well.
  term = query.to_s
  raise ArgumentError, "Query cannot be empty" if term.strip.empty?

  limit = options[:limit] || 20

  # No downcasing needed: ILIKE already compares case-insensitively.
  search_pattern = "%#{Models::Tag.dataset.escape_like(term)}%"
  name_matches = Sequel.ilike(:name, search_pattern)

  scope = options[:include_usage] ? Models::Tag.with_section_count : Models::Tag
  scope.where(name_matches).limit(limit).all
rescue ArgumentError
  raise
rescue StandardError => e
  log_error("Failed to search tags", e)
  raise ::SmartRAG::Errors::DatabaseError, "Tag search failed: #{e.message}"
end
368
+
369
# Return the most used tags by delegating to Models::Tag.popular.
#
# @param limit [Integer] Number of results (default: 20)
# @return [Array<Hash>] Popular tags with usage count
# @raise [SmartRAG::Errors::DatabaseError] if the query fails
def get_popular_tags(limit = 20)
  popular_tags = Models::Tag.popular(limit: limit)
  popular_tags
rescue StandardError => error
  log_error("Failed to get popular tags", error)
  raise ::SmartRAG::Errors::DatabaseError, "Popular tags query failed: #{error.message}"
end
378
+
379
# Return the full tag tree by delegating to Models::Tag.hierarchy.
#
# @return [Array<Hash>] Hierarchical tag structure
# @raise [SmartRAG::Errors::DatabaseError] if the query fails
def get_tag_hierarchy
  tag_tree = Models::Tag.hierarchy
  tag_tree
rescue StandardError => error
  log_error("Failed to get tag hierarchy", error)
  raise ::SmartRAG::Errors::DatabaseError, "Hierarchy query failed: #{error.message}"
end
387
+
388
+ private
389
+
390
# Build the LLM prompt for tag generation: one prompt per supported language,
# joined with a "---" separator line.
#
# Language codes are matched case-insensitively and by prefix, so :zh, :zh_cn
# and "zh-CN" select the Chinese prompt, and :en, :en_us and "en-GB" select the
# English one. Other languages are skipped with a warning.
#
# @param text [String] Text to analyze (already truncated by the caller)
# @param topic [String, nil] Optional topic/context
# @param languages [Array<Symbol, String>, Symbol, String, nil] Requested languages
# @param options [Hash] Prompt options
# @option options [Integer] :max_category_tags Maximum category tags (default: 3)
# @option options [Integer] :max_content_tags Maximum content tags (default: 5)
# @option options [Boolean] :include_category Ask for category tags (default: true)
# @option options [Boolean] :include_content Ask for content tags (default: true)
# @return [String] The combined prompt; empty when no requested language is supported
def build_tag_generation_prompt(text, topic, languages, options)
  max_category_tags = options[:max_category_tags] || 3
  max_content_tags = options[:max_content_tags] || 5
  include_category = options.fetch(:include_category, true)
  include_content = options.fetch(:include_content, true)
  builder_args = [text, topic, max_category_tags, max_content_tags, include_category, include_content]

  prompts = []

  Array(languages).each do |lang|
    # Normalize "zh-CN" / :ZH_cn style codes to "zh_cn" before matching.
    case lang.to_s.downcase.tr("-", "_")
    when "zh", /\Azh_/
      prompts << build_chinese_prompt(*builder_args)
    when "en", /\Aen_/
      prompts << build_english_prompt(*builder_args)
    else
      logger.warn "Unsupported language: #{lang}, skipping"
    end
  end

  prompts.join("\n\n---\n\n")
end
411
+
412
# Assemble the Chinese-language tag-generation prompt.
#
# @param text [String] Text to analyze
# @param topic [String, nil] Optional topic line (omitted when nil/false)
# @param max_category_tags [Integer] Upper bound quoted in the category requirement
# @param max_content_tags [Integer] Upper bound quoted in the content requirement
# @param include_category [Boolean] Whether to ask for category tags
# @param include_content [Boolean] Whether to ask for content tags
# @return [String] The prompt text
def build_chinese_prompt(text, topic, max_category_tags, max_content_tags, include_category, include_content)
  # The content requirement is numbered 2 only when the category requirement precedes it.
  content_item_number = include_category ? "2" : "1"

  segments = ["分析以下文本并生成标签:\n\n", "文本内容:\n#{text}\n\n"]
  segments << "主题:#{topic}\n\n" if topic
  segments << "要求:\n"
  if include_category
    segments << "1. 生成#{max_category_tags}个以内的高层级分类标签(如:人工智能、机器学习、深度学习等)\n"
  end
  if include_content
    segments << "#{content_item_number}. 生成#{max_content_tags}个以内的具体内容标签(描述文本的关键概念、技术、方法等)\n"
  end
  segments << "\n"
  segments << "以下列JSON格式输出:\n"
  segments << "{\"categories\": [...], \"content_tags\": [...]}\n"
  segments << "只输出JSON,不要额外解释。"

  segments.join
end
434
+
435
# Assemble the English-language tag-generation prompt.
#
# @param text [String] Text to analyze
# @param topic [String, nil] Optional topic line (omitted when nil/false)
# @param max_category_tags [Integer] Upper bound quoted in the category requirement
# @param max_content_tags [Integer] Upper bound quoted in the content requirement
# @param include_category [Boolean] Whether to ask for category tags
# @param include_content [Boolean] Whether to ask for content tags
# @return [String] The prompt text
def build_english_prompt(text, topic, max_category_tags, max_content_tags, include_category, include_content)
  # The content requirement is numbered 2 only when the category requirement precedes it.
  content_item_number = include_category ? "2" : "1"

  segments = ["Analyze the following text and generate tags:\n\n", "Text content:\n#{text}\n\n"]
  segments << "Topic: #{topic}\n\n" if topic
  segments << "Requirements:\n"
  if include_category
    segments << "1. Generate up to #{max_category_tags} high-level category tags (e.g., Artificial Intelligence, Machine Learning, Deep Learning)\n"
  end
  if include_content
    segments << "#{content_item_number}. Generate up to #{max_content_tags} specific content tags (describing key concepts, techniques, methods from the text)\n"
  end
  segments << "\n"
  segments << "Output in the following JSON format:\n"
  segments << '{"categories": [...], "content_tags": [...]}' + "\n"
  segments << "Output only JSON, no additional explanation."

  segments.join
end
457
+
458
# Send the prompt to the :analyze_content worker, retrying through with_retry.
#
# @param prompt [String] Prompt text, passed to the worker as :content
# @param options [Hash] Per-call overrides
# @option options [Integer] :retries Overrides config[:max_retries]
# @option options [Integer] :timeout Overrides config[:timeout]
# @return [Object] The worker's (truthy) result
# @raise [StandardError] the last failure is logged and re-raised unchanged
def call_llm_for_tags(prompt, options)
  retry_settings = {
    max_retries: options[:retries] || config[:max_retries],
    timeout: options[:timeout] || config[:timeout],
  }

  with_retry(**retry_settings) do
    llm_result = smart_prompt_engine.call_worker(:analyze_content, { content: prompt })
    # A nil/false result counts as a failed attempt so that it is retried.
    llm_result || raise("No response from LLM")
  end
rescue StandardError => e
  logger.error "LLM call failed: #{e.message}"
  raise
end
472
+
473
# Turn an LLM response into { categories: [...], content_tags: [...] }.
#
# Parsing strategy, in order:
#   1. strict JSON (the whole content, then the outermost {...} span so that
#      Markdown fences or chatter around the object do not matter);
#   2. regex extraction of the "categories" / "content_tags" arrays;
#   3. as a last resort, all double-quoted strings, split heuristically.
# Every tag name goes through clean_tag_name; names that clean to nil are dropped.
#
# @param response [Hash, String, Object, nil] Raw LLM response
# @param languages [Array<Symbol>] Requested languages (accepted for interface
#   compatibility; not used by the parser)
# @return [Hash] { categories: Array<String>, content_tags: Array<String> }
def parse_generated_tags(response, languages)
  empty_result = { categories: [], content_tags: [] }

  # Not every response object defines #empty?, so ask before calling it.
  return empty_result if response.nil? || (response.respond_to?(:empty?) && response.empty?)

  raw_content = extract_response_content(response).to_s
  return empty_result if raw_content.strip.empty?

  parsed = parse_tags_json(raw_content)
  unless parsed
    logger.warn "Failed to parse LLM response as JSON, attempting manual parsing"
    parsed = scan_tags_from_text(raw_content)
  end

  {
    categories: normalize_tag_list(parsed["categories"]),
    content_tags: normalize_tag_list(parsed["content_tags"]),
  }
end

# Parse the content as a JSON object; returns the Hash, or nil when no JSON
# object can be decoded (invalid JSON, or JSON that is not an object).
def parse_tags_json(raw_content)
  candidates = [raw_content, raw_content[/\{.*\}/m]].compact.uniq

  candidates.each do |candidate|
    begin
      parsed = JSON.parse(candidate)
      return parsed if parsed.is_a?(Hash)
    rescue JSON::ParserError
      next
    end
  end

  nil
end

# Recover tag lists from malformed output with regular expressions.
def scan_tags_from_text(raw_content)
  categories = quoted_items(raw_content[/"categories"\s*:\s*\[([^\]]+)\]/, 1])
  content_tags = quoted_items(raw_content[/"content_tags"\s*:\s*\[([^\]]+)\]/, 1])

  if categories.empty? && content_tags.empty?
    # Heuristic: the first third (at least two) are categories, the rest are
    # content tags. take/drop never return nil, even for a single string.
    all_quotes = quoted_items(raw_content)
    midpoint = [all_quotes.length / 3, 2].max
    categories = all_quotes.take(midpoint)
    content_tags = all_quotes.drop(midpoint)
  end

  { "categories" => categories, "content_tags" => content_tags }
end

# Return every double-quoted, non-empty string found in the text.
def quoted_items(text)
  text.to_s.scan(/"([^"]+)"/).flatten
end

# Clean a list of raw tag values; a lone string counts as a one-element list,
# anything that is neither an Array nor a String yields no tags.
def normalize_tag_list(values)
  list =
    case values
    when Array then values
    when String then [values]
    else []
    end

  list.map { |value| clean_tag_name(value) }.compact
end
523
+
524
# Pull the text payload out of an LLM response.
#
# For a Hash, the first truthy value among these locations wins: the
# chat-completion path (choices[0].message.content) with string keys, the same
# path with symbol keys, "content", :content. If none is present the Hash's
# #to_s is returned. Any non-Hash response is converted with #to_s.
#
# @param response [Hash, Object] Raw response
# @return [Object] The located content, or the response's string form
def extract_response_content(response)
  return response.to_s unless response.is_a?(Hash)

  lookup_paths = [
    ["choices", 0, "message", "content"],
    [:choices, 0, :message, :content],
    ["content"],
    [:content],
  ]

  lookup_paths.each do |path|
    found = response.dig(*path)
    return found if found
  end

  response.to_s
end
536
+
537
# Build the text sent to the LLM for one section: an optional heading line
# followed by the section content, separated by a blank line.
#
# The heading is "Section <number>: <title>" when both are present, just the
# title when only the title is present, and omitted otherwise.
#
# @param section [#section_number, #section_title, #content] The section
# @return [String] Heading and content joined by "\n\n" (nil parts are skipped)
def prepare_section_text(section)
  number = section.section_number
  title = section.section_title

  heading =
    if number && title
      "Section #{number}: #{title}"
    elsif title
      title
    end

  [heading, section.content].compact.join("\n\n")
end
545
+
546
# Resolve a mixed list of tag references into tag objects.
#
# Accepted elements: Models::Tag instances (returned as-is), Integers (looked
# up by id), Strings (looked up by name) and any other object responding to
# both #id and #name (returned as-is; this keeps test doubles usable).
#
# @param tags [Array] Tag references
# @return [Array] Resolved tag objects, in input order
# @raise [ArgumentError] if an id/name is not found or an element has an unsupported type
def ensure_tag_objects(tags)
  tags.map do |candidate|
    case candidate
    when Models::Tag
      candidate
    when Integer, String
      lookup_column = candidate.is_a?(Integer) ? :id : :name
      Models::Tag.find(lookup_column => candidate) || raise(ArgumentError, "Tag not found: #{candidate}")
    else
      duck_typed = candidate.respond_to?(:id) && candidate.respond_to?(:name)
      raise ArgumentError, "Invalid tag type: #{candidate.class}" unless duck_typed

      candidate
    end
  end
end
565
+
566
# Normalize a raw tag name.
#
# - nil, blank, or punctuation-only input yields nil (callers drop nils);
# - names containing Han characters are only stripped, never rewritten;
# - otherwise every character that is not a Unicode word character, whitespace
#   or "-" becomes a space, and runs of whitespace collapse to one space.
#
# Unicode word characters (\p{Word}) are used instead of \w because Ruby's \w
# is ASCII-only and would strip accented, Cyrillic, kana, etc. letters.
#
# @param name [#to_s, nil] Raw tag name
# @return [String, nil] Cleaned name, or nil when nothing usable remains
def clean_tag_name(name)
  return nil if name.nil?

  cleaned = name.to_s.strip
  return nil if cleaned.empty?

  return cleaned if cleaned.match?(/\p{Han}/)

  normalized = cleaned.gsub(/[^\p{Word}\s\-]/, " ").gsub(/\s+/, " ").strip
  normalized.empty? ? nil : normalized
end
576
+
577
# Run the block with a per-attempt timeout, retrying on failure with
# exponential backoff (1s, 2s, 4s, ... between attempts).
#
# `max_retries` is the total number of attempts. Values below 1 (including
# nil) are treated as 1, so the block always runs at least once and there is
# always a real exception to re-raise.
#
# @param max_retries [Integer, nil] Total number of attempts
# @param timeout [Numeric, nil] Seconds allowed per attempt (passed to Timeout.timeout)
# @yield The operation to attempt
# @return [Object] The block's return value from the first successful attempt
# @raise [ArgumentError] if no block is given
# @raise [StandardError] the exception of the last failed attempt
def with_retry(max_retries:, timeout:, &block)
  raise ArgumentError, "with_retry requires a block" unless block

  attempts = [max_retries.to_i, 1].max
  last_exception = nil

  attempts.times do |attempt|
    begin
      return Timeout.timeout(timeout) { block.call }
    rescue StandardError => e
      last_exception = e
      logger.warn "Attempt #{attempt + 1} failed: #{e.message}"

      # Back off before the next attempt, but not after the final one.
      sleep(2**attempt) if attempt < attempts - 1
    end
  end

  raise last_exception
end
596
+
597
# Log an error summary line followed by the exception's backtrace.
#
# The backtrace line is skipped when the exception has none (an exception
# object that was never raised returns nil from #backtrace).
#
# @param message [String] Context describing what failed
# @param exception [Exception] The error to report
# @return [void]
def log_error(message, exception)
  logger.error "#{message}: #{exception.message}"

  backtrace = exception.backtrace
  logger.error backtrace.join("\n") if backtrace
end
601
+
602
# Default settings, used as the base that user-supplied configuration is
# merged over. A new Hash (with a new STDOUT logger) is built on every call.
#
# @return [Hash] Defaults for :config_path, :max_retries, :timeout,
#   :batch_size, :max_text_length and :logger
def default_config
  llm_defaults = {
    config_path: "config/llm_config.yml",
    max_retries: 3,
    timeout: 30,
  }
  processing_defaults = {
    batch_size: 50,
    max_text_length: 4000,
  }

  llm_defaults.merge(processing_defaults).merge(logger: Logger.new(STDOUT))
end
612
+ end
613
+ end
614
+ end