bx_builder_chain 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +13 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +22 -0
  6. data/Gemfile.lock +120 -0
  7. data/README.md +74 -0
  8. data/Rakefile +12 -0
  9. data/bx_builder_chain.gemspec +35 -0
  10. data/lib/bx_builder_chain/chunker/recursive_text.rb +38 -0
  11. data/lib/bx_builder_chain/chunker/text.rb +38 -0
  12. data/lib/bx_builder_chain/configuration.rb +21 -0
  13. data/lib/bx_builder_chain/data.rb +28 -0
  14. data/lib/bx_builder_chain/dependency_helper.rb +22 -0
  15. data/lib/bx_builder_chain/llm/base.rb +64 -0
  16. data/lib/bx_builder_chain/llm/open_ai.rb +191 -0
  17. data/lib/bx_builder_chain/loader.rb +144 -0
  18. data/lib/bx_builder_chain/processors/base.rb +21 -0
  19. data/lib/bx_builder_chain/processors/csv.rb +27 -0
  20. data/lib/bx_builder_chain/processors/docx.rb +25 -0
  21. data/lib/bx_builder_chain/processors/html.rb +29 -0
  22. data/lib/bx_builder_chain/processors/json.rb +17 -0
  23. data/lib/bx_builder_chain/processors/pdf.rb +26 -0
  24. data/lib/bx_builder_chain/processors/text.rb +17 -0
  25. data/lib/bx_builder_chain/processors/xlsx.rb +31 -0
  26. data/lib/bx_builder_chain/utils/token_data/cl100k_base.tiktoken +100256 -0
  27. data/lib/bx_builder_chain/utils/token_length/base_validator.rb +45 -0
  28. data/lib/bx_builder_chain/utils/token_length/open_ai_validator.rb +70 -0
  29. data/lib/bx_builder_chain/utils/tokenization/byte_pair_encoding.rb +72 -0
  30. data/lib/bx_builder_chain/utils/tokenization/open_ai_encodings.rb +44 -0
  31. data/lib/bx_builder_chain/vectorsearch/base.rb +160 -0
  32. data/lib/bx_builder_chain/vectorsearch/pgvector.rb +228 -0
  33. data/lib/bx_builder_chain/version.rb +5 -0
  34. data/lib/bx_builder_chain.rb +38 -0
  35. data/lib/generators/bx_builder_chain/install_generator.rb +42 -0
  36. data/lib/generators/bx_builder_chain/templates/app/admin/bx_builder_chain_document.rb +65 -0
  37. data/lib/generators/bx_builder_chain/templates/app/controllers/bx_builder_chain/documents_controller.rb +65 -0
  38. data/lib/generators/bx_builder_chain/templates/app/controllers/bx_builder_chain/questions_controller.rb +33 -0
  39. data/lib/generators/bx_builder_chain/templates/app/controllers/bx_builder_chain/test_controller.rb +10 -0
  40. data/lib/generators/bx_builder_chain/templates/app/models/bx_builder_chain/document.rb +26 -0
  41. data/lib/generators/bx_builder_chain/templates/app/models/bx_builder_chain/document_chunk.rb +9 -0
  42. data/lib/generators/bx_builder_chain/templates/app/models/bx_builder_chain/embedding.rb +9 -0
  43. data/lib/generators/bx_builder_chain/templates/app/services/bx_builder_chain/document_upload_service.rb +47 -0
  44. data/lib/generators/bx_builder_chain/templates/app/services/bx_builder_chain/question_asking_service.rb +35 -0
  45. data/lib/generators/bx_builder_chain/templates/app/views/bx_builder_chain/test/form.html.erb +164 -0
  46. data/lib/generators/bx_builder_chain/templates/app/workers/bx_builder_chain/document_processor_worker.rb +32 -0
  47. data/lib/generators/bx_builder_chain/templates/initializer.rb +12 -0
  48. data/lib/generators/bx_builder_chain/templates/migration.rb +33 -0
  49. data/lib/pgvector/pg/binary_decoder/vector.rb +14 -0
  50. data/lib/pgvector/pg/text_decoder/vector.rb +12 -0
  51. data/lib/pgvector/pg.rb +10 -0
  52. data/lib/pgvector.rb +11 -0
  53. data/lib/sequel/plugins/pgvector/class_methods.rb +47 -0
  54. data/lib/sequel/plugins/pgvector/instance_methods.rb +34 -0
  55. data/lib/sequel/plugins/pgvector.rb +12 -0
  56. data/sig/bx_langchain_chat.rbs +4 -0
  57. metadata +238 -0
@@ -0,0 +1,191 @@
1
+ # frozen_string_literal: true
2
+
3
module BxBuilderChain::Llm
  # LLM interface for OpenAI APIs: https://platform.openai.com/overview
  #
  # Gem requirements:
  #     gem "ruby-openai", "~> 4.0.0"
  #
  # Usage:
  #     openai = BxBuilderChain::Llm::OpenAi.new(api_key:, llm_options: {})
  #
  class OpenAi < Base
    # Fallback model names and sampling settings; callers override them through
    # the +default_options+ argument of #initialize.
    DEFAULTS = {
      temperature: 0.0,
      completion_model_name: "text-davinci-003",
      chat_completion_model_name: "gpt-3.5-turbo",
      embeddings_model_name: "text-embedding-ada-002",
      dimension: 1536
    }.freeze
    LENGTH_VALIDATOR = BxBuilderChain::Utils::TokenLength::OpenAiValidator
    # Maps message types used by this library onto OpenAI chat roles.
    # Types that are not listed are passed through unchanged.
    ROLE_MAPPING = {
      "ai" => "assistant",
      "human" => "user"
    }.freeze

    # Function definitions sent with chat requests when set (see #chat).
    attr_accessor :functions

    #
    # @param api_key [String] OpenAI access token; defaults to the configured key
    # @param llm_options [Hash] extra keyword options forwarded to OpenAI::Client.new
    # @param default_options [Hash] overrides merged on top of DEFAULTS
    #
    def initialize(api_key: BxBuilderChain.configuration.openai_api_key, llm_options: {}, default_options: {})
      depends_on "ruby-openai"
      require "openai"

      @client = ::OpenAI::Client.new(access_token: api_key, **llm_options)
      @defaults = DEFAULTS.merge(default_options)
    end

    #
    # Generate an embedding for a given text
    #
    # @param text [String] The text to generate an embedding for
    # @param params extra parameters passed to OpenAI::Client#embeddings
    # @return [Array, nil] The embedding of the first result, or nil when the
    #   response carries no data
    #
    def embed(text:, **params)
      parameters = {model: @defaults[:embeddings_model_name], input: text}

      validate_max_tokens(text, parameters[:model])

      # NOTE(review): `client` is not defined in this class; presumably a reader on Base.
      response = client.embeddings(parameters: parameters.merge(params))
      response.dig("data", 0, "embedding")
    end

    #
    # Generate a completion for a given prompt
    #
    # @param prompt [String] The prompt to generate a completion for
    # @param params extra parameters passed to OpenAI::Client#completions
    # @return [String] The completion
    #
    def complete(prompt:, **params)
      parameters = compose_parameters @defaults[:completion_model_name], params

      parameters[:prompt] = prompt
      # The value returned by the length validator is sent as :max_tokens.
      parameters[:max_tokens] = validate_max_tokens(prompt, parameters[:model])

      response = client.completions(parameters: parameters)
      response.dig("choices", 0, "text")
    end

    #
    # Generate a chat completion for a given prompt or messages.
    #
    # == Examples
    #
    #     # simplest case, just give a prompt
    #     openai.chat prompt: "When was Ruby first released?"
    #
    #     # prompt plus some context about how to respond
    #     openai.chat context: "You are RubyGPT, a helpful chat bot for helping people learn Ruby", prompt: "Does Ruby have a REPL like IPython?"
    #
    #     # full control over messages that get sent
    #     openai.chat messages: [
    #       {
    #         role: "system",
    #         content: "You are RubyGPT, a helpful chat bot for helping people learn Ruby"
    #       },
    #       {
    #         role: "user",
    #         content: "When was Ruby first released?"
    #       }
    #     ]
    #
    #     # few-shot prompting with examples
    #     openai.chat prompt: "When was factory_bot released?",
    #       examples: [
    #         {
    #           role: "user",
    #           content: "When was Ruby on Rails released?"
    #         },
    #         {
    #           role: "assistant",
    #           content: "2004"
    #         }
    #       ]
    #
    # @param prompt [String] The prompt to generate a chat completion for
    # @param messages [Array<Hash, #content>] The messages that have been sent in the conversation;
    #   each entry is either a Hash with :role/:content or an object responding to #type and #content
    # @param context [String, #content] An initial context to provide as a system message, ie "You are RubyGPT, a helpful chat bot for helping people learn Ruby"
    # @param examples [Array<Hash, #content>] Examples of messages to provide to the model. Useful for Few-Shot Prompting
    # @param options [Hash] extra parameters passed to OpenAI::Client#chat
    # @return [String, nil] The content of the first returned choice
    # @raise [ArgumentError] when both prompt and messages are empty
    #
    # No block is consumed by this method, so responses are not streamed.
    #
    def chat(prompt: "", messages: [], context: "", examples: [], **options)
      raise ArgumentError, ":prompt or :messages argument is expected" if prompt.empty? && messages.empty?

      parameters = compose_parameters @defaults[:chat_completion_model_name], options
      parameters[:messages] = compose_chat_messages(prompt: prompt, messages: messages, context: context, examples: examples)

      # When functions are set they are sent along and :max_tokens is left
      # unset; otherwise :max_tokens comes from the length validator.
      if functions
        parameters[:functions] = functions
      else
        parameters[:max_tokens] = validate_max_tokens(parameters[:messages], parameters[:model])
      end

      response = client.chat(parameters: parameters)

      response.dig("choices", 0, "message", "content")
    end

    #
    # Generate a summary for a given text
    #
    # @param text [String] The text to generate a summary for
    # @return [String] The summary
    #
    # def summarize(text:)
    #   prompt_template = BxBuilderChain::Prompt.load_from_path(
    #     file_path: BxBuilderChain.root.join("langchain/llm/prompts/summarize_template.yaml")
    #   )
    #   prompt = prompt_template.format(text: text)

    #   complete(prompt: prompt, temperature: @defaults[:temperature])
    # end

    private

    # Builds the request parameters: model and default temperature first, then
    # caller-supplied params on top. A :stop_sequences entry is renamed to :stop.
    def compose_parameters(model, params)
      default_params = {model: model, temperature: @defaults[:temperature]}

      default_params[:stop] = params.delete(:stop_sequences) if params[:stop_sequences]

      default_params.merge(params)
    end

    # Assembles the ordered message list: examples, then messages, with the
    # context (if any) as the single leading system message and the prompt as
    # the trailing user message.
    def compose_chat_messages(prompt:, messages:, context:, examples:)
      history = []

      history.concat transform_messages(examples) unless examples.empty?

      history.concat transform_messages(messages) unless messages.empty?

      unless context.nil? || context.to_s.empty?
        # An explicit context replaces any system message already in the history.
        history.reject! { |message| message[:role] == "system" }
        history.prepend({role: "system", content: message_text(context)})
      end

      unless prompt.empty?
        if history.last && history.last[:role] == "user"
          # Fold the prompt into a trailing user message instead of sending two in a row.
          history.last[:content] += "\n#{prompt}"
        else
          history.append({role: "user", content: prompt})
        end
      end

      history
    end

    # Normalises messages to {role:, content:} hashes. Accepts Hashes keyed by
    # :role/:content (symbol or string keys) as well as objects responding to
    # #type and #content.
    def transform_messages(messages)
      messages.map do |message|
        role, content =
          if message.is_a?(Hash)
            [message[:role] || message["role"], message[:content] || message["content"]]
          else
            [message.type, message.content]
          end

        {role: ROLE_MAPPING.fetch(role, role), content: content}
      end
    end

    # Returns the text of a plain String or of an object exposing #content.
    def message_text(message)
      message.respond_to?(:content) ? message.content : message.to_s
    end

    def validate_max_tokens(messages, model)
      LENGTH_VALIDATOR.validate_max_tokens!(messages, model)
    end
  end
end
@@ -0,0 +1,144 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "open-uri"
4
+
5
module BxBuilderChain
  # Loads raw data from a file, a directory or a URL and turns it into
  # BxBuilderChain::Data, either through a caller-supplied block or through the
  # processor class whose EXTENSIONS / CONTENT_TYPES match the source.
  class Loader
    class FileNotFound < StandardError; end

    class UnknownFormatError < StandardError; end

    # Matches a leading URI scheme followed by "://".
    URI_REGEX = %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}

    # Load data from a file or URL. Shorthand for `BxBuilderChain::Loader.new(path).load`
    #
    # == Examples
    #
    #     # load a URL
    #     data = BxBuilderChain::Loader.load("https://example.com/docs/README.md")
    #
    #     # load a file
    #     data = BxBuilderChain::Loader.load("README.md")
    #
    #     # Load data using a custom processor
    #     data = BxBuilderChain::Loader.load("README.md") do |raw_data, options|
    #       # your processing code goes here
    #       # return data at the end here
    #     end
    #
    # @param path [String | Pathname] path to file or URL
    # @param options [Hash] options passed to the processor class used to process the data
    # @return [Data, Array<Data>] data loaded from path; an Array when path is a directory
    def self.load(path, options = {}, &block)
      new(path, options).load(&block)
    end

    # Initialize BxBuilderChain::Loader
    # @param path [String | Pathname] path to file or URL
    # @param options [Hash] options passed to the processor class used to process the data
    # @return [BxBuilderChain::Loader] loader instance
    def initialize(path, options = {})
      @options = options
      @path = path
    end

    # Is the path a URL?
    #
    # @return [Boolean] true if path is URL; a Pathname is never treated as one
    def url?
      return false if @path.is_a?(Pathname)

      @path.to_s.match?(URI_REGEX)
    end

    # Is the path a directory
    #
    # @return [Boolean] true if path is a directory
    def directory?
      File.directory?(@path)
    end

    # Load data from a file, directory or URL
    #
    #     loader = BxBuilderChain::Loader.new("README.md")
    #     # Load data using default processor for the file
    #     loader.load
    #
    #     # Load data using a custom processor
    #     loader.load do |raw_data, options|
    #       # your processing code goes here
    #       # return data at the end here
    #     end
    #
    # @yield [String, Hash] handle parsing raw output into string directly
    # @yieldparam [String] raw_data from the loaded URL or file
    # @yieldreturn [String] parsed data, as a String
    #
    # @return [Data, Array<Data>] data that was loaded
    # @raise [FileNotFound] when the path is neither a URL, a directory nor an existing file
    # @raise [UnknownFormatError] when no block is given and no processor matches the source
    def load(&block)
      return process_data(load_from_url, &block) if url?
      return load_from_directory(&block) if directory?

      process_data(load_from_path, &block)
    end

    private

    def load_from_url
      URI.parse(@path).open
    end

    def load_from_path
      return File.open(@path) if File.exist?(@path)

      raise FileNotFound, "File #{@path} does not exist"
    end

    # Loads every regular file below the directory. Sub-directories returned by
    # the recursive glob are skipped, otherwise their files would be loaded
    # again; files without a matching processor are ignored.
    def load_from_directory(&block)
      files = Dir.glob(File.join(@path, "**/*")).select { |entry| File.file?(entry) }

      files.map do |file|
        BxBuilderChain::Loader.new(file, @options).load(&block)
      rescue UnknownFormatError
        # Only files with supported extensions contribute to the result.
        nil
      end.flatten.compact
    end

    # Runs the block or the matching processor over the opened source and wraps
    # the result. The source handle is closed afterwards, including on error.
    def process_data(data, &block)
      # Kept on the instance because source_type reads the content type from it.
      @raw_data = data

      result = if block
        yield @raw_data.read, @options
      else
        processor_klass.new(@options).parse(@raw_data)
      end

      BxBuilderChain::Data.new(result)
    ensure
      data.close if data.respond_to?(:close) && !data.closed?
    end

    def processor_klass
      kind = find_processor
      raise UnknownFormatError, "No processor found for #{source_type.inspect}" unless kind

      BxBuilderChain::Processors.const_get(kind)
    end

    # @return [Symbol, nil] name of the first processor constant whose lookup list contains the source type
    def find_processor
      processors.find { |klass| processor_matches? "#{klass}::#{lookup_constant}", source_type }
    end

    def processor_matches?(constant, value)
      BxBuilderChain::Processors.const_get(constant).include?(value)
    end

    def processors
      BxBuilderChain::Processors.constants
    end

    # Content type for URLs, lower-cased file extension otherwise, so that
    # "REPORT.PDF" matches the same processor as "report.pdf".
    def source_type
      url? ? @raw_data.content_type : File.extname(@path).downcase
    end

    def lookup_constant
      url? ? :CONTENT_TYPES : :EXTENSIONS
    end
  end
end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
module BxBuilderChain
  module Processors
    # Processors load and parse/process various data types such as CSVs, PDFs, Word documents, HTML pages, and others.
    #
    # Subclasses override EXTENSIONS and CONTENT_TYPES with the values they
    # handle and implement #parse.
    class Base
      include BxBuilderChain::DependencyHelper

      # File extensions handled by the processor; empty for the base class.
      EXTENSIONS = [].freeze
      # Content types handled by the processor; empty for the base class.
      CONTENT_TYPES = [].freeze

      # @param options [Hash] processor-specific options, stored in @options
      def initialize(options = {})
        @options = options
      end

      # Parse the given source. Must be implemented by subclasses.
      #
      # @param data the source handed to the processor
      # @raise [NotImplementedError] always, in the base class
      def parse(data)
        raise NotImplementedError, "#{self.class} must implement #parse"
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+
5
module BxBuilderChain
  module Processors
    # Processor for delimiter-separated text.
    class Csv < Base
      EXTENSIONS = [".csv"].freeze
      CONTENT_TYPES = ["text/csv"].freeze

      # Parse the document and return its rows with surrounding whitespace
      # stripped from every cell.
      #
      # @param data [#read] source whose whole content is read and parsed
      # @return [Array<Array<String, nil>>] one array of cells per row; empty
      #   fields, which CSV yields as nil, stay nil
      def parse(data)
        ::CSV.new(data.read, col_sep: separator).map do |row|
          # Safe navigation: calling #strip on a nil (empty) field would raise.
          row.map { |cell| cell&.strip }
        end
      end

      private

      # Column separator taken from the :col_sep option, "," when not given.
      def separator
        @options[:col_sep] || ","
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
module BxBuilderChain
  module Processors
    # Processor for Word (.docx) documents, backed by the "docx" gem.
    class Docx < Base
      EXTENSIONS = [".docx"].freeze
      CONTENT_TYPES = ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"].freeze

      # Forwards its arguments to the parent initializer, then checks for and
      # loads the "docx" gem.
      def initialize(*)
        super
        depends_on "docx"
        require "docx"
      end

      # Parse the document and return the text
      # @param [File] data
      # @return [String] document text with leading/trailing whitespace removed
      def parse(data)
        ::Docx::Document
          .open(StringIO.new(data.read))
          .text
          .strip
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
module BxBuilderChain
  module Processors
    # Processor for HTML pages, backed by the "nokogiri" gem.
    class Html < Base
      EXTENSIONS = [".html", ".htm"].freeze
      CONTENT_TYPES = ["text/html"].freeze

      # We only look for headings and paragraphs
      TEXT_CONTENT_TAGS = %w[h1 h2 h3 h4 h5 h6 p].freeze

      # Forwards its arguments to the parent initializer, then checks for and
      # loads the "nokogiri" gem.
      def initialize(*)
        super
        depends_on "nokogiri"
        require "nokogiri"
      end

      # Parse the document and return the text
      # @param [File] data
      # @return [String] inner text of the matched heading/paragraph elements,
      #   joined by blank lines
      def parse(data)
        Nokogiri::HTML(data.read)
          .css(TEXT_CONTENT_TAGS.join(","))
          .map(&:inner_text)
          .join("\n\n")
          .strip
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
require "json"

module BxBuilderChain
  module Processors
    # Processor for JSON documents.
    class Json < Base
      EXTENSIONS = [".json"].freeze
      CONTENT_TYPES = ["application/json"].freeze

      # Parse the document and return the decoded JSON value
      # @param [File] data
      # @return [Hash, Array, Object] whatever JSON.parse yields for the content
      def parse(data)
        ::JSON.parse(data.read)
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
module BxBuilderChain
  module Processors
    # Processor for PDF documents, backed by the "pdf-reader" gem.
    class Pdf < Base
      EXTENSIONS = [".pdf"].freeze
      CONTENT_TYPES = ["application/pdf"].freeze

      # Forwards its arguments to the parent initializer, then checks for and
      # loads the "pdf-reader" gem.
      def initialize(*)
        super
        depends_on "pdf-reader"
        require "pdf-reader"
      end

      # Parse the document and return the text
      # @param [File] data
      # @return [String] text of every page, pages separated by a blank line
      def parse(data)
        ::PDF::Reader
          .new(StringIO.new(data.read))
          .pages
          .map(&:text)
          .join("\n\n")
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module BxBuilderChain
  module Processors
    # Processor for plain-text files; returns the content unchanged.
    class Text < Base
      EXTENSIONS = [".txt"].freeze
      CONTENT_TYPES = ["text/plain"].freeze

      # Parse the document and return the text
      # @param [File] data
      # @return [String] the complete content read from data
      def parse(data)
        data.read
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
module BxBuilderChain
  module Processors
    # Processor for Excel workbooks, backed by the "roo" gem.
    class Xlsx < Base
      EXTENSIONS = [".xlsx", ".xlsm"].freeze
      CONTENT_TYPES = ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"].freeze

      # Forwards its arguments to the parent initializer, then checks for and
      # loads the "roo" gem.
      def initialize(*)
        super
        depends_on "roo"
        require "roo"
      end

      # Parse the workbook and return its cell values as stripped strings
      # @param [File] data
      # @return [Array<Array<Array<String>>>] one entry per sheet; each sheet is
      #   an array of rows, each row an array of cells
      def parse(data)
        output = []

        xlsx_file = Roo::Spreadsheet.open(data)
        # The first block argument (presumably the sheet name) is not needed.
        xlsx_file.each_with_pagename do |_name, rows|
          output << rows.map do |row|
            row.map { |cell| cell.to_s.strip }
          end
        end

        output
      end
    end
  end
end