lex-rfp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. checksums.yaml +7 -0
  2. data/.github/workflows/ci.yml +16 -0
  3. data/.gitignore +12 -0
  4. data/.rspec +3 -0
  5. data/.rubocop.yml +66 -0
  6. data/CHANGELOG.md +15 -0
  7. data/CLAUDE.md +80 -0
  8. data/Dockerfile +6 -0
  9. data/Gemfile +12 -0
  10. data/LICENSE +21 -0
  11. data/README.md +119 -0
  12. data/lex-rfp.gemspec +32 -0
  13. data/lib/legion/extensions/rfp/analytics/client.rb +31 -0
  14. data/lib/legion/extensions/rfp/analytics/helpers/client.rb +24 -0
  15. data/lib/legion/extensions/rfp/analytics/runners/metrics.rb +87 -0
  16. data/lib/legion/extensions/rfp/analytics/runners/quality.rb +121 -0
  17. data/lib/legion/extensions/rfp/analytics/runners/win_rates.rb +88 -0
  18. data/lib/legion/extensions/rfp/analytics.rb +16 -0
  19. data/lib/legion/extensions/rfp/generate/client.rb +31 -0
  20. data/lib/legion/extensions/rfp/generate/helpers/client.rb +24 -0
  21. data/lib/legion/extensions/rfp/generate/runners/drafts.rb +98 -0
  22. data/lib/legion/extensions/rfp/generate/runners/sections.rb +97 -0
  23. data/lib/legion/extensions/rfp/generate/runners/templates.rb +61 -0
  24. data/lib/legion/extensions/rfp/generate.rb +16 -0
  25. data/lib/legion/extensions/rfp/ingest/client.rb +31 -0
  26. data/lib/legion/extensions/rfp/ingest/helpers/client.rb +24 -0
  27. data/lib/legion/extensions/rfp/ingest/runners/corpus.rb +66 -0
  28. data/lib/legion/extensions/rfp/ingest/runners/documents.rb +86 -0
  29. data/lib/legion/extensions/rfp/ingest/runners/parser.rb +84 -0
  30. data/lib/legion/extensions/rfp/ingest.rb +16 -0
  31. data/lib/legion/extensions/rfp/review/client.rb +31 -0
  32. data/lib/legion/extensions/rfp/review/helpers/client.rb +24 -0
  33. data/lib/legion/extensions/rfp/review/runners/approvals.rb +70 -0
  34. data/lib/legion/extensions/rfp/review/runners/comments.rb +76 -0
  35. data/lib/legion/extensions/rfp/review/runners/workflows.rb +86 -0
  36. data/lib/legion/extensions/rfp/review.rb +16 -0
  37. data/lib/legion/extensions/rfp/version.rb +9 -0
  38. data/lib/legion/extensions/rfp.rb +15 -0
  39. metadata +99 -0
@@ -0,0 +1,88 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Legion
4
+ module Extensions
5
+ module Rfp
6
+ module Analytics
7
+ module Runners
8
+ module WinRates
9
+ extend Legion::Extensions::Rfp::Analytics::Helpers::Client
10
+
11
# Computes the share of decided proposals (outcome :won or :lost) that were won.
#
# @param proposals [Array<Hash>] proposal records; only the :outcome key is read
# @return [Hash] :result (win rate rounded to 4 places, 0.0 when nothing is decided)
#   plus the :won, :decided and :total counts
def overall_win_rate(proposals:, **)
  decided = proposals.select { |p| %i[won lost].include?(p[:outcome]) }
  won = decided.count { |p| p[:outcome] == :won }
  rate = decided.empty? ? 0.0 : (won.to_f / decided.length).round(4)

  # Every key is always present so callers never have to branch on the result shape.
  { result: rate, won: won, decided: decided.length, total: proposals.length }
end
18
+
19
# Computes a win rate for each distinct :rfp_source value.
#
# @param proposals [Array<Hash>] proposal records; reads :rfp_source and :outcome
# @return [Hash] { result: { source => { rate:, won:, decided: } } }; a source with no
#   decided proposals reports rate 0.0 with zero counts
def win_rate_by_source(proposals:, **)
  rates = proposals.group_by { |p| p[:rfp_source] }.transform_values do |group|
    decided = group.select { |p| %i[won lost].include?(p[:outcome]) }
    won = decided.count { |p| p[:outcome] == :won }
    rate = decided.empty? ? 0.0 : (won.to_f / decided.length).round(4)

    # :won is reported even for undecided groups so every entry has the same keys.
    { rate: rate, won: won, decided: decided.length }
  end

  { result: rates }
end
31
+
32
# Computes a win rate for each distinct :template value.
#
# @param proposals [Array<Hash>] proposal records; reads :template and :outcome
# @return [Hash] { result: { template => { rate:, won:, decided: } } }; a template with no
#   decided proposals reports rate 0.0 with zero counts
def win_rate_by_template(proposals:, **)
  rates = proposals.group_by { |p| p[:template] }.transform_values do |group|
    decided = group.select { |p| %i[won lost].include?(p[:outcome]) }
    won = decided.count { |p| p[:outcome] == :won }
    rate = decided.empty? ? 0.0 : (won.to_f / decided.length).round(4)

    # :won is reported even for undecided groups so every entry has the same keys.
    { rate: rate, won: won, decided: decided.length }
  end

  { result: rates }
end
44
+
45
# Buckets proposals by submission period and reports totals and win rate per bucket.
#
# Proposals are ordered by :submitted_at (missing values sort as ''), then grouped by
# the first 7 characters for :monthly, by quarter_key for :quarterly, and by the first
# 4 characters for any other period value.
#
# @param proposals [Array<Hash>] proposal records; reads :submitted_at and :outcome
# @param period [Symbol] :monthly, :quarterly, or anything else for the 4-character bucket
# @return [Hash] { result: { bucket => { total:, decided:, won:, rate: } }, period: period }
def trend(proposals:, period: :monthly, **)
  ordered = proposals.sort_by { |proposal| proposal[:submitted_at] || '' }

  bucket_for =
    case period
    when :monthly then ->(stamp) { stamp&.slice(0, 7) }
    when :quarterly then ->(stamp) { quarter_key(stamp) }
    else ->(stamp) { stamp&.slice(0, 4) }
    end

  buckets = ordered.group_by { |proposal| bucket_for.call(proposal[:submitted_at]) }

  summary = buckets.transform_values do |members|
    settled = members.select { |member| %i[won lost].include?(member[:outcome]) }
    wins = settled.count { |member| member[:outcome] == :won }
    win_rate = settled.empty? ? 0.0 : (wins.to_f / settled.length).round(4)

    { total: members.length, decided: settled.length, won: wins, rate: win_rate }
  end

  { result: summary, period: period }
end
69
+
70
+ private
71
+
72
# Builds a "YYYY-Qn" bucket key from a date string whose first four characters are the
# year and whose characters 5..6 are the month (e.g. "2024-05-17").
#
# @param date_str [String, nil] date string, or nil
# @return [String, nil] the quarter key, or nil when the date is missing or its month
#   is not in 1..12
def quarter_key(date_str)
  return nil unless date_str

  month = date_str[5, 2].to_i
  # A missing or malformed month would otherwise produce a bogus "Q0" or "Q5+" bucket.
  return nil unless (1..12).cover?(month)

  "#{date_str[0, 4]}-Q#{((month - 1) / 3) + 1}"
end
80
+
81
+ include Legion::Extensions::Helpers::Lex if Legion::Extensions.const_defined?(:Helpers) &&
82
+ Legion::Extensions::Helpers.const_defined?(:Lex)
83
+ end
84
+ end
85
+ end
86
+ end
87
+ end
88
+ end
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'legion/extensions/rfp/analytics/helpers/client'
4
+ require 'legion/extensions/rfp/analytics/runners/metrics'
5
+ require 'legion/extensions/rfp/analytics/runners/win_rates'
6
+ require 'legion/extensions/rfp/analytics/runners/quality'
7
+ require 'legion/extensions/rfp/analytics/client'
8
+
9
+ module Legion
10
+ module Extensions
11
+ module Rfp
12
+ module Analytics
13
+ end
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'helpers/client'
4
+ require_relative 'runners/drafts'
5
+ require_relative 'runners/sections'
6
+ require_relative 'runners/templates'
7
+
8
+ module Legion
9
+ module Extensions
10
+ module Rfp
11
+ module Generate
12
# Object-style entry point that mixes in the Generate helper and runner modules and
# remembers connection options supplied at construction time.
class Client
  include Helpers::Client
  include Runners::Drafts
  include Runners::Sections
  include Runners::Templates

  # @return [Hash] the non-nil connection options given to #initialize
  attr_reader :opts

  # @param base_url [String, nil] stored under :base_url when not nil
  # @param token [String, nil] stored under :token when not nil
  def initialize(base_url: nil, token: nil, **)
    @opts = { base_url: base_url, token: token }.reject { |_key, value| value.nil? }
  end

  # Delegates to the mixed-in #client with the stored options; per-call overrides win.
  #
  # @param override [Hash] options that take precedence over the stored ones
  def client(**override)
    super(**@opts.merge(override))
  end
end
28
+ end
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'faraday'
4
+
5
+ module Legion
6
+ module Extensions
7
+ module Rfp
8
+ module Generate
9
+ module Helpers
10
+ module Client
11
# Builds a Faraday connection configured for JSON requests and responses.
#
# @param base_url [String] URL passed to Faraday.new
# @param token [String, nil] when given, sent as a Bearer Authorization header
# @return the object returned by Faraday.new
def client(base_url: 'http://localhost:4567', token: nil, **)
  Faraday.new(url: base_url) do |faraday|
    faraday.request :json
    faraday.response :json, content_type: /\bjson$/
    faraday.headers['Content-Type'] = 'application/json'
    next unless token

    faraday.headers['Authorization'] = "Bearer #{token}"
  end
end
19
+ end
20
+ end
21
+ end
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,98 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Legion
4
+ module Extensions
5
+ module Rfp
6
+ module Generate
7
+ module Runners
8
+ module Drafts
9
+ extend Legion::Extensions::Rfp::Generate::Helpers::Client
10
+
11
# Generates a complete draft by answering every question parsed out of the RFP text
# and joining the answers with a "---" separator.
#
# @param rfp_text [String] raw RFP text handed to parse_rfp
# @param context [Hash] extra context forwarded to each section response
# @param model [Object, nil] forwarded to generate_section_response
# @return [Hash] { result: joined draft, sections: count, questions_answered: count }
def generate_full_draft(rfp_text:, context: {}, model: nil, **)
  answers = parse_rfp(rfp_text).map do |item|
    generate_section_response(
      question: item[:question],
      section: item[:section],
      context: context,
      model: model
    )
  end

  answered = answers.length
  body = answers.map { |answer| answer[:result] }.join("\n\n---\n\n")
  { result: body, sections: answered, questions_answered: answered }
end
21
+
22
# Answers a single question: retrieves context, builds a prompt, and asks the LLM.
#
# @param question [String] the question to answer
# @param context [Hash] extra context forwarded to build_prompt
# @param model [Object, nil] forwarded to call_llm
# @param scope [Symbol] retrieval scope forwarded to retrieve_context
# @return [Hash] { result: answer, context_used: number of retrieved items, question: question }
def generate_response(question:, context: {}, model: nil, scope: :all, **)
  references = retrieve_context(question: question, scope: scope)
  answer = call_llm(
    prompt: build_prompt(question: question, context: context, retrieved: references),
    model: model
  )

  { result: answer, context_used: references.length, question: question }
end
29
+
30
# Produces a revised answer from a previous answer and reviewer feedback.
#
# @param question [String] the original question
# @param previous_answer [String] the answer being revised
# @param feedback [String] reviewer feedback to incorporate
# @param context [Hash] extra context forwarded to build_revision_prompt
# @param model [Object, nil] forwarded to call_llm
# @return [Hash] { result: answer, question: question, revision: true }
def regenerate(question:, previous_answer:, feedback:, context: {}, model: nil, **)
  revision_prompt = build_revision_prompt(question: question, previous: previous_answer,
                                          feedback: feedback, context: context)
  revised = call_llm(prompt: revision_prompt, model: model)

  { result: revised, question: question, revision: true }
end
41
+
42
+ private
43
+
44
# Parses RFP text into questions by borrowing the Ingest parser runner on a throwaway object.
#
# @param text [String] raw RFP text
# @return the :result entry of Parser#parse_rfp_questions
def parse_rfp(text)
  parser = Object.new.extend(Legion::Extensions::Rfp::Ingest::Runners::Parser)
  parser.parse_rfp_questions(text: text)[:result]
end
50
+
51
# Fetches up to five reference items for a question from Legion::Apollo.
#
# @param question [String] used as the retrieval query
# @param scope [Symbol] retrieval scope forwarded to Apollo
# @return [Array] the retrieved items, or [] when Apollo is not defined or does not
#   return an Array
def retrieve_context(question:, scope:)
  return [] unless defined?(Legion::Apollo)

  hits = Legion::Apollo.retrieve(query: question, scope: scope, limit: 5)
  return hits if hits.is_a?(Array)

  []
end
57
+
58
# Assembles the LLM prompt for answering a single RFP question.
#
# @param question [String] the question to answer
# @param context [Hash] extra context; included via #inspect when not empty
# @param retrieved [Array<Hash>] reference documents; reads :content or 'content'
# @return [String] the prompt, parts joined with newlines
def build_prompt(question:, context:, retrieved:)
  parts = ['You are an expert proposal writer for a healthcare organization.']

  # Only announce reference material when some was actually retrieved; otherwise the
  # prompt would point the model at references that are not there.
  unless retrieved.empty?
    parts << 'Use the following reference material to craft your response:'
    retrieved.each_with_index do |doc, idx|
      parts << "\n--- Reference #{idx + 1} ---\n#{doc[:content] || doc['content']}"
    end
  end

  parts << "\nAdditional context: #{context.inspect}" unless context.empty?
  parts << "\nQuestion: #{question}"
  parts << "\nProvide a professional, detailed response suitable for an RFP submission."
  parts.join("\n")
end
71
+
72
# Assembles the LLM prompt for revising an earlier answer using reviewer feedback.
#
# @param question [String] the original question
# @param previous [String] the previous answer
# @param feedback [String] reviewer feedback
# @param context [Hash] extra context; included via #inspect when not empty
# @return [String] the prompt, parts joined with newlines
def build_revision_prompt(question:, previous:, feedback:, context:)
  [
    'You are revising an RFP response based on reviewer feedback.',
    "\nOriginal question: #{question}",
    "\nPrevious answer:\n#{previous}",
    "\nReviewer feedback: #{feedback}",
    (context.empty? ? nil : "\nAdditional context: #{context.inspect}"),
    "\nProvide an improved response incorporating the feedback."
  ].compact.join("\n")
end
81
+
82
# Sends the prompt to Legion::LLM when it is defined, otherwise returns a placeholder
# containing the first 101 characters of the prompt.
#
# @param prompt [String] the prompt text
# @param model [Object, nil] accepted but not used by this method
# @return [String, Object] a Hash reply's :content or :result (else its #to_s), or
#   #to_s of any other reply
def call_llm(prompt:, model: nil) # rubocop:disable Lint/UnusedMethodArgument
  return "[LLM not available] Prompt: #{prompt[0..100]}..." unless defined?(Legion::LLM)

  reply = Legion::LLM.ask(message: prompt)
  return reply.to_s unless reply.is_a?(Hash)

  reply[:content] || reply[:result] || reply.to_s
end
90
+
91
+ include Legion::Extensions::Helpers::Lex if Legion::Extensions.const_defined?(:Helpers) &&
92
+ Legion::Extensions::Helpers.const_defined?(:Lex)
93
+ end
94
+ end
95
+ end
96
+ end
97
+ end
98
+ end
@@ -0,0 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Legion
4
+ module Extensions
5
+ module Rfp
6
+ module Generate
7
+ module Runners
8
+ module Sections
9
+ extend Legion::Extensions::Rfp::Generate::Helpers::Client
10
+
11
# Answers one question in the context of a named RFP section.
#
# @param question [String] the question to answer
# @param section [String, nil] section name used for retrieval and prompting
# @param context [Hash] extra context forwarded to build_section_prompt
# @param model [Object, nil] forwarded to call_section_llm
# @param scope [Symbol] retrieval scope
# @return [Hash] { result:, section:, question:, context_used: }
def generate_section_response(question:, section: nil, context: {}, model: nil, scope: :all, **)
  references = retrieve_section_context(question: question, section: section, scope: scope)
  section_prompt = build_section_prompt(
    question: question,
    section: section,
    context: context,
    retrieved: references
  )
  reply = call_section_llm(prompt: section_prompt, model: model)

  { result: reply, section: section, question: question, context_used: references.length }
end
24
+
25
# Generates an executive summary for an RFP response.
#
# @param rfp_text [String] RFP text forwarded to the prompt builder
# @param company_context [Hash] company details forwarded to the prompt builder
# @param model [Object, nil] forwarded to call_section_llm
# @return [Hash] { result: answer, type: :executive_summary }
def generate_executive_summary(rfp_text:, company_context: {}, model: nil, **)
  summary = call_section_llm(
    prompt: build_executive_summary_prompt(rfp_text: rfp_text, company_context: company_context),
    model: model
  )

  { result: summary, type: :executive_summary }
end
30
+
31
# Generates a compliance matrix for a list of requirements.
#
# @param requirements [Array] requirements forwarded to the prompt builder
# @param capabilities [Hash] capabilities forwarded to the prompt builder
# @param model [Object, nil] forwarded to call_section_llm
# @return [Hash] { result: answer, type: :compliance_matrix, requirements_count: count }
def generate_compliance_matrix(requirements:, capabilities: {}, model: nil, **)
  matrix_prompt = build_compliance_prompt(requirements: requirements, capabilities: capabilities)
  matrix = call_section_llm(prompt: matrix_prompt, model: model)

  { result: matrix, type: :compliance_matrix, requirements_count: requirements.length }
end
36
+
37
+ private
38
+
39
# Fetches up to five reference items from Legion::Apollo for a section/question pair.
# The query is "<section> - <question>", or just the non-nil part when one is missing.
#
# @param question [String, nil] question text
# @param section [String, nil] section name
# @param scope [Symbol] retrieval scope forwarded to Apollo
# @return [Array] the retrieved items, or [] when Apollo is not defined or does not
#   return an Array
def retrieve_section_context(question:, section:, scope:)
  return [] unless defined?(Legion::Apollo)

  search_terms = [section, question].reject(&:nil?)
  found = Legion::Apollo.retrieve(query: search_terms.join(' - '), scope: scope, limit: 5)
  return found if found.is_a?(Array)

  []
end
46
+
47
# Assembles the LLM prompt for a single RFP section.
#
# @param question [String] the question to answer
# @param section [String, nil] section name; its line is omitted when nil
# @param context [Hash] extra context; included via #inspect when not empty
# @param retrieved [Array<Hash>] reference documents; reads :content or 'content'
# @return [String] the prompt, parts joined with newlines
def build_section_prompt(question:, section:, context:, retrieved:)
  lines = ['You are writing a specific section of an RFP response.']
  lines.push("Section: #{section}") if section

  references = retrieved.each_with_index.map do |doc, position|
    "\n--- Reference #{position + 1} ---\n#{doc[:content] || doc['content']}"
  end
  lines.concat(references)

  lines.push("\nAdditional context: #{context.inspect}") unless context.empty?
  lines.push("\nQuestion: #{question}")
  lines.push("\nProvide a focused, professional response for this section.")
  lines.join("\n")
end
60
+
61
# Assembles the LLM prompt for an executive summary. Only the first 2001 characters
# of the RFP text are embedded in the prompt.
#
# @param rfp_text [String] RFP text
# @param company_context [Hash] company details; included via #inspect when not empty
# @return [String] the prompt, parts joined with newlines
def build_executive_summary_prompt(rfp_text:, company_context:)
  [
    'Write an executive summary for the following RFP response.',
    (company_context.empty? ? nil : "\nCompany context: #{company_context.inspect}"),
    "\nRFP overview:\n#{rfp_text[0..2000]}",
    "\nWrite a compelling 2-3 paragraph executive summary."
  ].compact.join("\n")
end
68
+
69
# Assembles the LLM prompt for a compliance matrix, listing each requirement on its
# own numbered line.
#
# @param requirements [Array<Hash, String, Object>] requirements; a Hash contributes its
#   :text (or 'text') entry, anything else is interpolated as-is
# @param capabilities [Hash] capabilities; included via #inspect when not empty
# @return [String] the prompt, parts joined with newlines
def build_compliance_prompt(requirements:, capabilities:)
  parts = ['Generate a compliance matrix for the following requirements.']
  parts << "\nCapabilities: #{capabilities.inspect}" unless capabilities.empty?

  requirements.each_with_index do |req, idx|
    # String#[] raises TypeError for a Symbol argument, so only hashes are probed for
    # :text; plain strings are used directly.
    text = req.is_a?(Hash) ? (req[:text] || req['text'] || req) : req
    parts << "#{idx + 1}. #{text}"
  end

  parts << "\nFor each requirement, indicate: Compliant, Partially Compliant, or Non-Compliant with explanation."
  parts.join("\n")
end
80
+
81
# Sends the prompt to Legion::LLM when it is defined, otherwise returns a placeholder
# containing the first 101 characters of the prompt.
#
# @param prompt [String] the prompt text
# @param model [Object, nil] accepted but not used by this method
# @return [String, Object] a Hash reply's :content or :result (else its #to_s), or
#   #to_s of any other reply
def call_section_llm(prompt:, model: nil) # rubocop:disable Lint/UnusedMethodArgument
  unless defined?(Legion::LLM)
    preview = prompt[0..100]
    return "[LLM not available] Prompt: #{preview}..."
  end

  response = Legion::LLM.ask(message: prompt)
  return response.to_s unless response.is_a?(Hash)

  response[:content] || response[:result] || response.to_s
end
89
+
90
+ include Legion::Extensions::Helpers::Lex if Legion::Extensions.const_defined?(:Helpers) &&
91
+ Legion::Extensions::Helpers.const_defined?(:Lex)
92
+ end
93
+ end
94
+ end
95
+ end
96
+ end
97
+ end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Legion
4
+ module Extensions
5
+ module Rfp
6
+ module Generate
7
+ module Runners
8
+ module Templates
9
+ extend Legion::Extensions::Rfp::Generate::Helpers::Client
10
+
11
# Built-in proposal outlines keyed by template name. Each entry lists its sections in
# order and a tone. The table is frozen at every level because the per-template hashes
# are returned to callers, and an accidental mutation would otherwise change the
# shared constant.
DEFAULT_TEMPLATES = {
  standard: {
    sections: %i[executive_summary company_overview approach timeline pricing],
    tone: :formal
  },
  government: {
    sections: %i[executive_summary compliance technical_approach management staffing pricing],
    tone: :formal
  },
  healthcare: {
    sections: %i[executive_summary clinical_approach quality_measures compliance network
                 implementation pricing],
    tone: :formal
  }
}.transform_values { |template| template.transform_values(&:freeze).freeze }.freeze
18
+
19
# Lists the names of the built-in templates.
#
# @return [Hash] { result: template names, count: number of names }
def list_templates(**)
  names = DEFAULT_TEMPLATES.keys
  { result: names, count: names.length }
end
22
+
23
# Looks up a built-in template by name.
#
# @param name [String, Symbol, nil] template name
# @return [Hash] { result: template, name: name } when found, otherwise
#   { result: nil, error: "Template not found: ..." }
def get_template(name:, **)
  # nil or other non-symbolizable names are reported as "not found" rather than
  # crashing on #to_sym.
  key = name.respond_to?(:to_sym) ? name.to_sym : nil
  template = DEFAULT_TEMPLATES[key]
  return { result: nil, error: "Template not found: #{name}" } unless template

  { result: template, name: name }
end
29
+
30
# Builds an outline by pairing each section of a template with matching RFP data.
#
# @param name [String, Symbol, nil] template name
# @param rfp_data [Hash] section content keyed by section name (Symbol or String keys)
# @return [Hash] { result: [{ section:, tone:, content: }], template: name, sections: count }
#   when found, otherwise { result: nil, error: "Template not found: ..." }
def apply_template(name:, rfp_data:, **)
  # nil or other non-symbolizable names are reported as "not found" rather than
  # crashing on #to_sym.
  key = name.respond_to?(:to_sym) ? name.to_sym : nil
  template = DEFAULT_TEMPLATES[key]
  return { result: nil, error: "Template not found: #{name}" } unless template

  outline = template[:sections].map do |section|
    content = rfp_data[section]
    # Fall back to the String key so string-keyed payloads do not silently lose content.
    content = rfp_data[section.to_s] if content.nil?
    { section: section, tone: template[:tone], content: content }
  end

  { result: outline, template: name, sections: outline.length }
end
40
+
41
# Suggests a built-in template from keywords found in the RFP text (case-insensitive).
# Healthcare keywords are checked before government keywords; anything else gets
# :standard.
#
# @param rfp_text [String] RFP text to scan
# @return [Hash] { result: :healthcare | :government | :standard, confidence: :heuristic }
def suggest_template(rfp_text:, **)
  text_lower = rfp_text.downcase
  # "health(?:care)?" is needed because \bhealth\b alone never matches "healthcare".
  suggested = if text_lower.match?(/\b(?:medicare|medicaid|clinical|hipaa|phi|health(?:care)?)\b/)
                :healthcare
              elsif text_lower.match?(/\b(?:federal|government|agency|cfr|far|dfars)\b/)
                :government
              else
                :standard
              end

  { result: suggested, confidence: :heuristic }
end
53
+
54
+ include Legion::Extensions::Helpers::Lex if Legion::Extensions.const_defined?(:Helpers) &&
55
+ Legion::Extensions::Helpers.const_defined?(:Lex)
56
+ end
57
+ end
58
+ end
59
+ end
60
+ end
61
+ end
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'legion/extensions/rfp/generate/helpers/client'
4
+ require 'legion/extensions/rfp/generate/runners/drafts'
5
+ require 'legion/extensions/rfp/generate/runners/sections'
6
+ require 'legion/extensions/rfp/generate/runners/templates'
7
+ require 'legion/extensions/rfp/generate/client'
8
+
9
+ module Legion
10
+ module Extensions
11
+ module Rfp
12
+ module Generate
13
+ end
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'helpers/client'
4
+ require_relative 'runners/documents'
5
+ require_relative 'runners/corpus'
6
+ require_relative 'runners/parser'
7
+
8
+ module Legion
9
+ module Extensions
10
+ module Rfp
11
+ module Ingest
12
# Object-style entry point that mixes in the Ingest helper and runner modules and
# remembers connection options supplied at construction time.
class Client
  include Helpers::Client
  include Runners::Documents
  include Runners::Corpus
  include Runners::Parser

  # @return [Hash] the non-nil connection options given to #initialize
  attr_reader :opts

  # @param base_url [String, nil] stored under :base_url when not nil
  # @param token [String, nil] stored under :token when not nil
  def initialize(base_url: nil, token: nil, **)
    @opts = { base_url: base_url, token: token }.reject { |_key, value| value.nil? }
  end

  # Delegates to the mixed-in #client with the stored options; per-call overrides win.
  #
  # @param override [Hash] options that take precedence over the stored ones
  def client(**override)
    super(**@opts.merge(override))
  end
end
28
+ end
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'faraday'
4
+
5
+ module Legion
6
+ module Extensions
7
+ module Rfp
8
+ module Ingest
9
+ module Helpers
10
+ module Client
11
# Builds a Faraday connection configured for JSON requests and responses.
#
# @param base_url [String] URL passed to Faraday.new
# @param token [String, nil] when given, sent as a Bearer Authorization header
# @return the object returned by Faraday.new
def client(base_url: 'http://localhost:4567', token: nil, **)
  Faraday.new(url: base_url) do |faraday|
    faraday.request :json
    faraday.response :json, content_type: /\bjson$/
    faraday.headers['Content-Type'] = 'application/json'
    next unless token

    faraday.headers['Authorization'] = "Bearer #{token}"
  end
end
19
+ end
20
+ end
21
+ end
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,66 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Legion
4
+ module Extensions
5
+ module Rfp
6
+ module Ingest
7
+ module Runners
8
+ module Corpus
9
+ extend Legion::Extensions::Rfp::Ingest::Helpers::Client
10
+
11
# Extracts a document's text, splits it into chunks, and wraps each chunk in a record.
#
# @param file_path [String] path of the document to ingest
# @param tags [Array] tags copied onto every record
# @param metadata [Hash] base metadata; :format and :offset are merged into a copy per chunk
# @return [Hash] { result: records, count:, source: file_path }, or
#   { result: nil, error: "Unsupported format: ..." } when supported? rejects the path
def ingest_document(file_path:, tags: [], metadata: {}, **)
  unless supported?(file_path: file_path)[:result]
    return { result: nil, error: "Unsupported format: #{file_path}" }
  end

  extraction = extract_text(file_path: file_path)
  pieces = chunk_text(text: extraction[:result])[:result]

  records = pieces.each_with_index.map do |piece, position|
    {
      content: piece[:text],
      source: file_path,
      chunk_id: position,
      tags: tags,
      metadata: metadata.merge(format: extraction[:format], offset: piece[:offset])
    }
  end

  { result: records, count: records.length, source: file_path }
end
30
+
31
# Ingests every supported regular file found under a directory.
#
# @param directory [String] directory to scan
# @param tags [Array] tags forwarded to ingest_document
# @param recursive [Boolean] when true the glob descends into subdirectories
# @return [Hash] { result: per-file ingest results, files_processed:, total_chunks: }
def ingest_directory(directory:, tags: [], recursive: true, **)
  glob_parts = recursive ? [directory, '**', '*'] : [directory, '*']
  candidates = Dir.glob(::File.join(*glob_parts)).select { |path| ::File.file?(path) }

  results = candidates.filter_map do |path|
    ingest_document(file_path: path, tags: tags) if supported?(file_path: path)[:result]
  end

  chunk_total = results.sum { |entry| entry[:count] }
  { result: results, files_processed: results.length, total_chunks: chunk_total }
end
43
+
44
# Sends each chunk to Legion::Apollo.ingest.
#
# @param chunks [Array<Hash>] chunk records; reads :content, :tags and :metadata
# @param scope [Symbol] scope forwarded to Apollo
# @return [Hash] { result: Apollo return values, count: }, or
#   { result: nil, error: 'Apollo not available' } when Legion::Apollo is not defined
def ingest_to_apollo(chunks:, scope: :global, **)
  return { result: nil, error: 'Apollo not available' } unless defined?(Legion::Apollo)

  receipts = chunks.map do |piece|
    payload = {
      content: piece[:content],
      tags: piece[:tags] || [],
      metadata: piece[:metadata] || {},
      scope: scope
    }
    Legion::Apollo.ingest(**payload)
  end

  { result: receipts, count: receipts.length }
end
58
+
59
+ include Legion::Extensions::Helpers::Lex if Legion::Extensions.const_defined?(:Helpers) &&
60
+ Legion::Extensions::Helpers.const_defined?(:Lex)
61
+ end
62
+ end
63
+ end
64
+ end
65
+ end
66
+ end
@@ -0,0 +1,86 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Legion
4
+ module Extensions
5
+ module Rfp
6
+ module Ingest
7
+ module Runners
8
+ module Documents
9
+ extend Legion::Extensions::Rfp::Ingest::Helpers::Client
10
+
11
+ SUPPORTED_FORMATS = %w[pdf docx md markdown xlsx html htm].freeze
12
+
13
# Reports whether a path's extension (lower-cased, dots removed) is in SUPPORTED_FORMATS.
#
# @param file_path [#to_s] path to inspect
# @return [Hash] { result: true/false, format: normalized extension }
def supported?(file_path:, **)
  extension = ::File.extname(file_path.to_s).tr('.', '').downcase
  known = SUPPORTED_FORMATS.include?(extension)

  { result: known, format: extension }
end
17
+
18
# Extracts the text of a document, dispatching on its format.
#
# @param file_path [String] path of the document
# @param format [String, Symbol, nil] explicit format; defaults to the file extension.
#   Normalized the same way as the extension (stringified, dots removed, lower-cased)
#   so :pdf, 'PDF' and '.pdf' are all accepted.
# @return [Hash] { result: extracted content, format: normalized format, size: content length }
# @raise [ArgumentError] when the format is not one of the handled formats
def extract_text(file_path:, format: nil, **)
  fmt = (format || ::File.extname(file_path.to_s)).to_s.delete('.').downcase
  content = case fmt
            when 'pdf' then extract_pdf(file_path)
            when 'docx' then extract_docx(file_path)
            when 'md', 'markdown' then ::File.read(file_path)
            when 'xlsx' then extract_xlsx(file_path)
            when 'html', 'htm' then extract_html(file_path)
            else raise ArgumentError, "Unsupported format: #{fmt}"
            end

  { result: content, format: fmt, size: content.length }
end
30
+
31
# Splits text into fixed-size chunks where consecutive chunks share `overlap` characters.
#
# @param text [String, nil] text to split
# @param chunk_size [Integer] maximum characters per chunk; must be positive
# @param overlap [Integer] characters shared between consecutive chunks; must be smaller
#   than chunk_size
# @return [Hash] { result: [{ text:, offset:, length: }], count: }; empty for nil or empty text
# @raise [ArgumentError] when chunk_size is not positive or overlap is not smaller than
#   chunk_size (previously these inputs looped forever or failed on a nil chunk)
def chunk_text(text:, chunk_size: 1000, overlap: 200, **)
  return { result: [], count: 0 } if text.nil? || text.empty?
  raise ArgumentError, "chunk_size must be positive (got #{chunk_size})" unless chunk_size.positive?
  unless overlap < chunk_size
    raise ArgumentError, "overlap (#{overlap}) must be smaller than chunk_size (#{chunk_size})"
  end

  # Each chunk starts `stride` characters after the previous one.
  stride = chunk_size - overlap
  chunks = (0...text.length).step(stride).map do |offset|
    piece = text[offset, chunk_size]
    { text: piece, offset: offset, length: piece.length }
  end

  { result: chunks, count: chunks.length }
end
43
+
44
+ private
45
+
46
# Extracts PDF text through Legion::Data::Extract when it is defined; otherwise
# returns a placeholder string naming the file.
#
# @param file_path [String] path of the PDF
# @return the extractor's return value, or the placeholder String
def extract_pdf(file_path)
  return "[PDF extraction requires legion-data] #{file_path}" unless defined?(Legion::Data::Extract)

  Legion::Data::Extract.call(file_path, :pdf)
end
53
+
54
# Extracts DOCX text through Legion::Data::Extract when it is defined; otherwise
# returns a placeholder string naming the file.
#
# @param file_path [String] path of the DOCX file
# @return the extractor's return value, or the placeholder String
def extract_docx(file_path)
  return "[DOCX extraction requires legion-data] #{file_path}" unless defined?(Legion::Data::Extract)

  Legion::Data::Extract.call(file_path, :docx)
end
61
+
62
# Extracts XLSX text through Legion::Data::Extract when it is defined; otherwise
# returns a placeholder string naming the file.
#
# @param file_path [String] path of the XLSX file
# @return the extractor's return value, or the placeholder String
def extract_xlsx(file_path)
  return "[Excel extraction requires legion-data] #{file_path}" unless defined?(Legion::Data::Extract)

  Legion::Data::Extract.call(file_path, :xlsx)
end
69
+
70
# Reads an HTML file and reduces it to plain text: script and style elements are
# removed, remaining tags become spaces, and whitespace runs collapse to one space.
#
# @param file_path [String] path of the HTML file
# @return [String] the stripped text
def extract_html(file_path)
  markup = ::File.read(file_path)
  without_scripts = markup.gsub(%r{<script[^>]*>.*?</script>}mi, '')
  without_styles = without_scripts.gsub(%r{<style[^>]*>.*?</style>}mi, '')
  text_only = without_styles.gsub(/<[^>]+>/, ' ')

  text_only.gsub(/\s+/, ' ').strip
end
78
+
79
+ include Legion::Extensions::Helpers::Lex if Legion::Extensions.const_defined?(:Helpers) &&
80
+ Legion::Extensions::Helpers.const_defined?(:Lex)
81
+ end
82
+ end
83
+ end
84
+ end
85
+ end
86
+ end