bx_builder_chain 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +13 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +22 -0
- data/Gemfile.lock +120 -0
- data/README.md +74 -0
- data/Rakefile +12 -0
- data/bx_builder_chain.gemspec +35 -0
- data/lib/bx_builder_chain/chunker/recursive_text.rb +38 -0
- data/lib/bx_builder_chain/chunker/text.rb +38 -0
- data/lib/bx_builder_chain/configuration.rb +21 -0
- data/lib/bx_builder_chain/data.rb +28 -0
- data/lib/bx_builder_chain/dependency_helper.rb +22 -0
- data/lib/bx_builder_chain/llm/base.rb +64 -0
- data/lib/bx_builder_chain/llm/open_ai.rb +191 -0
- data/lib/bx_builder_chain/loader.rb +144 -0
- data/lib/bx_builder_chain/processors/base.rb +21 -0
- data/lib/bx_builder_chain/processors/csv.rb +27 -0
- data/lib/bx_builder_chain/processors/docx.rb +25 -0
- data/lib/bx_builder_chain/processors/html.rb +29 -0
- data/lib/bx_builder_chain/processors/json.rb +17 -0
- data/lib/bx_builder_chain/processors/pdf.rb +26 -0
- data/lib/bx_builder_chain/processors/text.rb +17 -0
- data/lib/bx_builder_chain/processors/xlsx.rb +31 -0
- data/lib/bx_builder_chain/utils/token_data/cl100k_base.tiktoken +100256 -0
- data/lib/bx_builder_chain/utils/token_length/base_validator.rb +45 -0
- data/lib/bx_builder_chain/utils/token_length/open_ai_validator.rb +70 -0
- data/lib/bx_builder_chain/utils/tokenization/byte_pair_encoding.rb +72 -0
- data/lib/bx_builder_chain/utils/tokenization/open_ai_encodings.rb +44 -0
- data/lib/bx_builder_chain/vectorsearch/base.rb +160 -0
- data/lib/bx_builder_chain/vectorsearch/pgvector.rb +228 -0
- data/lib/bx_builder_chain/version.rb +5 -0
- data/lib/bx_builder_chain.rb +38 -0
- data/lib/generators/bx_builder_chain/install_generator.rb +42 -0
- data/lib/generators/bx_builder_chain/templates/app/admin/bx_builder_chain_document.rb +65 -0
- data/lib/generators/bx_builder_chain/templates/app/controllers/bx_builder_chain/documents_controller.rb +65 -0
- data/lib/generators/bx_builder_chain/templates/app/controllers/bx_builder_chain/questions_controller.rb +33 -0
- data/lib/generators/bx_builder_chain/templates/app/controllers/bx_builder_chain/test_controller.rb +10 -0
- data/lib/generators/bx_builder_chain/templates/app/models/bx_builder_chain/document.rb +26 -0
- data/lib/generators/bx_builder_chain/templates/app/models/bx_builder_chain/document_chunk.rb +9 -0
- data/lib/generators/bx_builder_chain/templates/app/models/bx_builder_chain/embedding.rb +9 -0
- data/lib/generators/bx_builder_chain/templates/app/services/bx_builder_chain/document_upload_service.rb +47 -0
- data/lib/generators/bx_builder_chain/templates/app/services/bx_builder_chain/question_asking_service.rb +35 -0
- data/lib/generators/bx_builder_chain/templates/app/views/bx_builder_chain/test/form.html.erb +164 -0
- data/lib/generators/bx_builder_chain/templates/app/workers/bx_builder_chain/document_processor_worker.rb +32 -0
- data/lib/generators/bx_builder_chain/templates/initializer.rb +12 -0
- data/lib/generators/bx_builder_chain/templates/migration.rb +33 -0
- data/lib/pgvector/pg/binary_decoder/vector.rb +14 -0
- data/lib/pgvector/pg/text_decoder/vector.rb +12 -0
- data/lib/pgvector/pg.rb +10 -0
- data/lib/pgvector.rb +11 -0
- data/lib/sequel/plugins/pgvector/class_methods.rb +47 -0
- data/lib/sequel/plugins/pgvector/instance_methods.rb +34 -0
- data/lib/sequel/plugins/pgvector.rb +12 -0
- data/sig/bx_langchain_chat.rbs +4 -0
- metadata +238 -0
@@ -0,0 +1,191 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module BxBuilderChain::Llm
  # LLM interface for OpenAI APIs: https://platform.openai.com/overview
  #
  # Gem requirements:
  #    gem "ruby-openai", "~> 4.0.0"
  #
  # Usage:
  #    openai = BxBuilderChain::Llm::OpenAi.new(api_key:, llm_options: {})
  #
  class OpenAi < Base
    # Default model names and sampling settings; individual entries can be
    # overridden through the +default_options+ argument of #initialize.
    DEFAULTS = {
      temperature: 0.0,
      completion_model_name: "text-davinci-003",
      chat_completion_model_name: "gpt-3.5-turbo",
      embeddings_model_name: "text-embedding-ada-002",
      dimension: 1536
    }.freeze
    LENGTH_VALIDATOR = BxBuilderChain::Utils::TokenLength::OpenAiValidator
    # Maps message types used by callers to the role names sent to the chat API.
    # Types without an entry are passed through unchanged.
    ROLE_MAPPING = {
      "ai" => "assistant",
      "human" => "user"
    }.freeze

    attr_accessor :functions

    #
    # Build a client for the OpenAI API.
    #
    # @param api_key [String] API key, defaults to the globally configured one
    # @param llm_options [Hash] extra keyword options forwarded to OpenAI::Client.new
    # @param default_options [Hash] overrides merged on top of DEFAULTS
    #
    def initialize(api_key: BxBuilderChain.configuration.openai_api_key, llm_options: {}, default_options: {})
      # NOTE(review): depends_on is not defined in this file; presumably it is
      # inherited and verifies the gem is available - confirm in Base.
      depends_on "ruby-openai"
      require "openai"

      @client = ::OpenAI::Client.new(access_token: api_key, **llm_options)
      @defaults = DEFAULTS.merge(default_options)
    end

    #
    # Generate an embedding for a given text
    #
    # @param text [String] The text to generate an embedding for
    # @param params extra parameters passed to OpenAI::Client#embeddings
    # @return [Array, nil] The embedding of the first result, or nil when the response carries none
    #
    def embed(text:, **params)
      parameters = {model: @defaults[:embeddings_model_name], input: text}

      # Called for validation only; the returned value is not used here.
      validate_max_tokens(text, parameters[:model])

      response = client.embeddings(parameters: parameters.merge(params))
      response.dig("data", 0, "embedding")
    end

    #
    # Generate a completion for a given prompt
    #
    # @param prompt [String] The prompt to generate a completion for
    # @param params extra parameters passed to OpenAI::Client#completions
    # @return [String] The completion
    #
    def complete(prompt:, **params)
      parameters = compose_parameters @defaults[:completion_model_name], params

      parameters[:prompt] = prompt
      parameters[:max_tokens] = validate_max_tokens(prompt, parameters[:model])

      response = client.completions(parameters: parameters)
      response.dig("choices", 0, "text")
    end

    #
    # Generate a chat completion for a given prompt or messages.
    #
    # == Examples
    #
    #     # simplest case, just give a prompt
    #     openai.chat prompt: "When was Ruby first released?"
    #
    #     # prompt plus some context about how to respond
    #     openai.chat context: "You are RubyGPT, a helpful chat bot for helping people learn Ruby", prompt: "Does Ruby have a REPL like IPython?"
    #
    #     # full control over messages that get sent, equivalent to the above
    #     openai.chat messages: [
    #       {
    #         role: "system",
    #         content: "You are RubyGPT, a helpful chat bot for helping people learn Ruby"
    #       },
    #       {
    #         role: "user",
    #         content: "Does Ruby have a REPL like IPython?"
    #       }
    #     ]
    #
    #     # few-shot prompting with examples
    #     openai.chat prompt: "When was factory_bot released?",
    #       examples: [
    #         {
    #           role: "user",
    #           content: "When was Ruby on Rails released?"
    #         },
    #         {
    #           role: "assistant",
    #           content: "2004"
    #         }
    #       ]
    #
    # @param prompt [String] The prompt to generate a chat completion for
    # @param messages [Array] The messages that have been sent in the conversation;
    #   each entry is either a Hash with role/content keys or an object responding to #type and #content
    # @param context [String, #content] An initial context to provide as a system message, ie "You are RubyGPT, a helpful chat bot for helping people learn Ruby"
    # @param examples [Array] Examples of messages to provide to the model (same shape as +messages+). Useful for Few-Shot Prompting
    # @param options [Hash] extra parameters passed to OpenAI::Client#chat
    # @return [String, nil] The content of the first choice's message
    # @raise [ArgumentError] when both +prompt+ and +messages+ are empty
    #
    def chat(prompt: "", messages: [], context: "", examples: [], **options)
      raise ArgumentError, ":prompt or :messages argument is expected" if prompt.empty? && messages.empty?

      parameters = compose_parameters @defaults[:chat_completion_model_name], options
      parameters[:messages] = compose_chat_messages(prompt: prompt, messages: messages, context: context, examples: examples)

      # When functions are configured they are sent along and max_tokens is
      # left unset; otherwise max_tokens comes from the length validator.
      if functions
        parameters[:functions] = functions
      else
        parameters[:max_tokens] = validate_max_tokens(parameters[:messages], parameters[:model])
      end

      response = client.chat(parameters: parameters)

      response.dig("choices", 0, "message", "content")
    end

    #
    # Generate a summary for a given text
    #
    # @param text [String] The text to generate a summary for
    # @return [String] The summary
    #
    # def summarize(text:)
    #   prompt_template = BxBuilderChain::Prompt.load_from_path(
    #     file_path: BxBuilderChain.root.join("langchain/llm/prompts/summarize_template.yaml")
    #   )
    #   prompt = prompt_template.format(text: text)

    #   complete(prompt: prompt, temperature: @defaults[:temperature])
    # end

    private

    # Merge caller-supplied parameters over the model/temperature defaults.
    # The :stop_sequences key is renamed to :stop and is always removed
    # from +params+, so a nil value is never forwarded under the original key.
    def compose_parameters(model, params)
      default_params = {model: model, temperature: @defaults[:temperature]}

      stop_sequences = params.delete(:stop_sequences)
      default_params[:stop] = stop_sequences if stop_sequences

      default_params.merge(params)
    end

    # Build the ordered list of role/content hashes for a chat request:
    # examples first, then prior messages, with the context (if any) as the
    # single leading system message and the prompt as the trailing user turn.
    def compose_chat_messages(prompt:, messages:, context:, examples:)
      history = []

      history.concat transform_messages(examples) unless examples.empty?

      history.concat transform_messages(messages) unless messages.empty?

      unless context.nil? || context.to_s.empty?
        # An explicit context replaces any system message already in the history.
        history.reject! { |message| message[:role] == "system" }
        # Accept both a plain String and a message-like object exposing #content.
        system_content = context.respond_to?(:content) ? context.content : context.to_s
        history.prepend({role: "system", content: system_content})
      end

      unless prompt.empty?
        if history.last && history.last[:role] == "user"
          # Fold the prompt into the trailing user message instead of adding a second one.
          history.last[:content] += "\n#{prompt}"
        else
          history.append({role: "user", content: prompt})
        end
      end

      history
    end

    # Convert messages into the role/content hashes sent to the API. Accepts
    # Hashes (symbol or string keys :role/:content) as well as objects that
    # respond to #type and #content; roles are translated via ROLE_MAPPING.
    def transform_messages(messages)
      messages.map do |message|
        if message.is_a?(Hash)
          role = message[:role] || message["role"]
          content = message[:content] || message["content"]
        else
          role = message.type
          content = message.content
        end

        {role: ROLE_MAPPING.fetch(role, role), content: content}
      end
    end

    # Delegate to the length validator configured in LENGTH_VALIDATOR.
    def validate_max_tokens(messages, model)
      LENGTH_VALIDATOR.validate_max_tokens!(messages, model)
    end
  end
end
|
@@ -0,0 +1,144 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "open-uri"
|
4
|
+
|
5
|
+
module BxBuilderChain
  # Loads raw data from a local file, a directory or a URL and hands it to a
  # processor (or a caller-supplied block) to turn it into BxBuilderChain::Data.
  class Loader
    class FileNotFound < StandardError; end

    class UnknownFormatError < StandardError; end

    URI_REGEX = %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}

    # Load data from a file or URL. Shorthand for `BxBuilderChain::Loader.new(path).load`
    #
    # == Examples
    #
    #     # load a URL
    #     data = BxBuilderChain::Loader.load("https://example.com/docs/README.md")
    #
    #     # load a file
    #     data = BxBuilderChain::Loader.load("README.md")
    #
    #     # Load data using a custom processor
    #     data = BxBuilderChain::Loader.load("README.md") do |raw_data, options|
    #       # your processing code goes here
    #       # return data at the end here
    #     end
    #
    # @param path [String | Pathname] path to file or URL
    # @param options [Hash] options passed to the processor class used to process the data
    # @return [Data] data loaded from path
    def self.load(path, options = {}, &block)
      new(path, options).load(&block)
    end

    # Initialize BxBuilderChain::Loader
    # @param path [String | Pathname] path to file or URL
    # @param options [Hash] options passed to the processor class used to process the data
    # @return [BxBuilderChain::Loader] loader instance
    def initialize(path, options = {})
      @options = options
      @path = path
    end

    # Is the path a URL?
    #
    # @return [Boolean] true if path is URL
    def url?
      return false if @path.is_a?(Pathname)

      URI_REGEX.match?(@path.to_s)
    end

    # Is the path a directory
    #
    # @return [Boolean] true if path is a directory
    def directory?
      File.directory?(@path)
    end

    # Load data from a file or URL
    #
    #     loader = BxBuilderChain::Loader.new("README.md")
    #     # Load data using default processor for the file
    #     loader.load
    #
    #     # Load data using a custom processor
    #     loader.load do |raw_data, options|
    #       # your processing code goes here
    #       # return data at the end here
    #     end
    #
    # @yield [String, Hash] handle parsing raw output into string directly
    # @yieldparam [String] raw_data from the loaded URL or file
    # @yieldreturn [String] parsed data, as a String
    #
    # @return [Data, Array<Data>] data that was loaded; an Array when the path is a directory
    # @raise [FileNotFound] when a local path does not exist
    # @raise [UnknownFormatError] when no block is given and no processor matches the source
    def load(&block)
      return process_data(load_from_url, &block) if url?
      return load_from_directory(&block) if directory?

      process_data(load_from_path, &block)
    end

    private

    # Open the URL stored in @path (relies on open-uri) and return the IO.
    def load_from_url
      URI.parse(@path).open
    end

    # Open the local file stored in @path; the handle is closed by #process_data.
    def load_from_path
      return File.open(@path) if File.exist?(@path)

      raise FileNotFound, "File #{@path} does not exist"
    end

    # Load every file below the directory, skipping files no processor supports.
    def load_from_directory(&block)
      Dir.glob(File.join(@path, "**/*")).map do |file|
        # The recursive glob already lists nested files, so loading the
        # sub-directory entries as well would return those files twice.
        next if File.directory?(file)

        BxBuilderChain::Loader.new(file, @options).load(&block)
      rescue UnknownFormatError
        # Only load and add to result files with supported extensions
        nil
      end.flatten.compact
    end

    # Turn the opened IO into a Data object, either through the caller's block
    # or through the processor matching the source type. The IO is closed
    # afterwards, also when processing raises.
    def process_data(data, &block)
      @raw_data = data

      result = if block
        yield @raw_data.read, @options
      else
        processor_klass.new(@options).parse(@raw_data)
      end

      BxBuilderChain::Data.new(result)
    ensure
      data.close if data.respond_to?(:close) && !(data.respond_to?(:closed?) && data.closed?)
    end

    # Resolve the processor class for the current source or raise UnknownFormatError.
    def processor_klass
      kind = find_processor
      raise UnknownFormatError, "No processor found for #{source_type.inspect}" unless kind

      BxBuilderChain::Processors.const_get(kind)
    end

    # Name of the first constant under Processors whose EXTENSIONS (for paths)
    # or CONTENT_TYPES (for URLs) list contains the source type; nil if none.
    def find_processor
      processors.find { |klass| processor_matches? "#{klass}::#{lookup_constant}", source_type }
    end

    def processor_matches?(constant, value)
      BxBuilderChain::Processors.const_get(constant).include?(value)
    end

    def processors
      BxBuilderChain::Processors.constants
    end

    # Content type reported by the opened URL, or the file extension for local paths.
    def source_type
      url? ? @raw_data.content_type : File.extname(@path)
    end

    def lookup_constant
      url? ? :CONTENT_TYPES : :EXTENSIONS
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module BxBuilderChain
  module Processors
    # Processors load and parse/process various data types such as CSVs, PDFs, Word documents, HTML pages, and others.
    #
    # Subclasses declare the file extensions and content types they handle in
    # EXTENSIONS and CONTENT_TYPES and implement #parse.
    class Base
      include BxBuilderChain::DependencyHelper

      # File extensions (including the leading dot) handled by the processor.
      EXTENSIONS = [].freeze
      # Content types handled by the processor.
      CONTENT_TYPES = [].freeze

      # @param options [Hash] processor-specific options
      def initialize(options = {})
        @options = options
      end

      # Parse the given data; must be implemented by subclasses.
      #
      # @param data [#read] IO-like object holding the raw document
      # @raise [NotImplementedError] always, in this base class
      def parse(data)
        raise NotImplementedError, "#{self.class} must implement #parse"
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "csv"
|
4
|
+
|
5
|
+
module BxBuilderChain
  module Processors
    # Processor for comma-separated (or otherwise delimited) text files.
    class Csv < Base
      EXTENSIONS = [".csv"].freeze
      CONTENT_TYPES = ["text/csv"].freeze

      # Parse the document and return its rows
      # @param [File] data
      # @return [Array<Array<String, nil>>] one Array per row; cells are
      #   stripped of surrounding whitespace, empty cells stay nil
      def parse(data)
        ::CSV.new(data.read, col_sep: separator).map do |row|
          # CSV yields nil for empty unquoted fields, so strip only real strings.
          row.map { |cell| cell&.strip }
        end
      end

      private

      # Column separator taken from the :col_sep option, "," by default.
      def separator
        @options[:col_sep] || ","
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module BxBuilderChain
  module Processors
    # Processor for Word (.docx) documents, backed by the "docx" gem.
    class Docx < Base
      EXTENSIONS = [".docx"].freeze
      CONTENT_TYPES = ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"].freeze

      # @param options [Hash] processor options; any further arguments are ignored
      def initialize(options = {}, *)
        # Run Base#initialize so @options is set like in every other processor.
        super(options)
        depends_on "docx"
        require "docx"
      end

      # Parse the document and return the text
      # @param [File] data
      # @return [String]
      def parse(data)
        # The whole input is read into memory and handed over as a StringIO.
        ::Docx::Document
          .open(StringIO.new(data.read))
          .text
          .strip
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module BxBuilderChain
  module Processors
    # Processor for HTML pages, backed by the "nokogiri" gem.
    class Html < Base
      EXTENSIONS = [".html", ".htm"].freeze
      CONTENT_TYPES = ["text/html"].freeze

      # We only look for headings and paragraphs
      TEXT_CONTENT_TAGS = %w[h1 h2 h3 h4 h5 h6 p].freeze

      # @param options [Hash] processor options; any further arguments are ignored
      def initialize(options = {}, *)
        # Run Base#initialize so @options is set like in every other processor.
        super(options)
        depends_on "nokogiri"
        require "nokogiri"
      end

      # Parse the document and return the text
      # @param [File] data
      # @return [String] inner text of the matched heading/paragraph elements,
      #   separated by blank lines
      def parse(data)
        Nokogiri::HTML(data.read)
          .css(TEXT_CONTENT_TAGS.join(","))
          .map(&:inner_text)
          .join("\n\n")
          .strip
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module BxBuilderChain
  module Processors
    # Processor for JSON documents.
    class Json < Base
      EXTENSIONS = [".json"].freeze
      CONTENT_TYPES = ["application/json"].freeze

      # Parse the document and return the decoded JSON value
      # @param [File] data
      # @return [Hash, Array] whatever JSON.parse produces for the document's top-level value
      def parse(data)
        ::JSON.parse(data.read)
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module BxBuilderChain
  module Processors
    # Processor for PDF documents, backed by the "pdf-reader" gem.
    class Pdf < Base
      EXTENSIONS = [".pdf"].freeze
      CONTENT_TYPES = ["application/pdf"].freeze

      # @param options [Hash] processor options; any further arguments are ignored
      def initialize(options = {}, *)
        # Run Base#initialize so @options is set like in every other processor.
        super(options)
        depends_on "pdf-reader"
        require "pdf-reader"
      end

      # Parse the document and return the text
      # @param [File] data
      # @return [String] text of all pages, separated by blank lines
      def parse(data)
        # The whole input is read into memory and handed over as a StringIO.
        ::PDF::Reader
          .new(StringIO.new(data.read))
          .pages
          .map(&:text)
          .join("\n\n")
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module BxBuilderChain
  module Processors
    # Processor for plain-text files; returns the content unchanged.
    class Text < Base
      EXTENSIONS = [".txt"].freeze
      CONTENT_TYPES = ["text/plain"].freeze

      # Parse the document and return the text
      # @param [File] data
      # @return [String]
      def parse(data)
        data.read
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module BxBuilderChain
  module Processors
    # Processor for Excel workbooks, backed by the "roo" gem.
    class Xlsx < Base
      EXTENSIONS = [".xlsx", ".xlsm"].freeze
      CONTENT_TYPES = ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"].freeze

      # @param options [Hash] processor options; any further arguments are ignored
      def initialize(options = {}, *)
        # Run Base#initialize so @options is set like in every other processor.
        super(options)
        depends_on "roo"
        require "roo"
      end

      # Parse the document and return the cell values
      # @param [File] data
      # @return [Array<Array<Array<String>>>] one entry per sheet; each sheet is
      #   an Array of rows and each row an Array of stripped cell strings
      def parse(data)
        output = []

        xlsx_file = Roo::Spreadsheet.open(data)
        # The sheet name is not needed, only the rows of each sheet.
        xlsx_file.each_with_pagename do |_sheet_name, rows|
          output << rows.map { |row| row.map { |cell| cell.to_s.strip } }
        end

        output
      end
    end
  end
end
|