bx_builder_chain 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +13 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +22 -0
  6. data/Gemfile.lock +120 -0
  7. data/README.md +74 -0
  8. data/Rakefile +12 -0
  9. data/bx_builder_chain.gemspec +35 -0
  10. data/lib/bx_builder_chain/chunker/recursive_text.rb +38 -0
  11. data/lib/bx_builder_chain/chunker/text.rb +38 -0
  12. data/lib/bx_builder_chain/configuration.rb +21 -0
  13. data/lib/bx_builder_chain/data.rb +28 -0
  14. data/lib/bx_builder_chain/dependency_helper.rb +22 -0
  15. data/lib/bx_builder_chain/llm/base.rb +64 -0
  16. data/lib/bx_builder_chain/llm/open_ai.rb +191 -0
  17. data/lib/bx_builder_chain/loader.rb +144 -0
  18. data/lib/bx_builder_chain/processors/base.rb +21 -0
  19. data/lib/bx_builder_chain/processors/csv.rb +27 -0
  20. data/lib/bx_builder_chain/processors/docx.rb +25 -0
  21. data/lib/bx_builder_chain/processors/html.rb +29 -0
  22. data/lib/bx_builder_chain/processors/json.rb +17 -0
  23. data/lib/bx_builder_chain/processors/pdf.rb +26 -0
  24. data/lib/bx_builder_chain/processors/text.rb +17 -0
  25. data/lib/bx_builder_chain/processors/xlsx.rb +31 -0
  26. data/lib/bx_builder_chain/utils/token_data/cl100k_base.tiktoken +100256 -0
  27. data/lib/bx_builder_chain/utils/token_length/base_validator.rb +45 -0
  28. data/lib/bx_builder_chain/utils/token_length/open_ai_validator.rb +70 -0
  29. data/lib/bx_builder_chain/utils/tokenization/byte_pair_encoding.rb +72 -0
  30. data/lib/bx_builder_chain/utils/tokenization/open_ai_encodings.rb +44 -0
  31. data/lib/bx_builder_chain/vectorsearch/base.rb +160 -0
  32. data/lib/bx_builder_chain/vectorsearch/pgvector.rb +228 -0
  33. data/lib/bx_builder_chain/version.rb +5 -0
  34. data/lib/bx_builder_chain.rb +38 -0
  35. data/lib/generators/bx_builder_chain/install_generator.rb +42 -0
  36. data/lib/generators/bx_builder_chain/templates/app/admin/bx_builder_chain_document.rb +65 -0
  37. data/lib/generators/bx_builder_chain/templates/app/controllers/bx_builder_chain/documents_controller.rb +65 -0
  38. data/lib/generators/bx_builder_chain/templates/app/controllers/bx_builder_chain/questions_controller.rb +33 -0
  39. data/lib/generators/bx_builder_chain/templates/app/controllers/bx_builder_chain/test_controller.rb +10 -0
  40. data/lib/generators/bx_builder_chain/templates/app/models/bx_builder_chain/document.rb +26 -0
  41. data/lib/generators/bx_builder_chain/templates/app/models/bx_builder_chain/document_chunk.rb +9 -0
  42. data/lib/generators/bx_builder_chain/templates/app/models/bx_builder_chain/embedding.rb +9 -0
  43. data/lib/generators/bx_builder_chain/templates/app/services/bx_builder_chain/document_upload_service.rb +47 -0
  44. data/lib/generators/bx_builder_chain/templates/app/services/bx_builder_chain/question_asking_service.rb +35 -0
  45. data/lib/generators/bx_builder_chain/templates/app/views/bx_builder_chain/test/form.html.erb +164 -0
  46. data/lib/generators/bx_builder_chain/templates/app/workers/bx_builder_chain/document_processor_worker.rb +32 -0
  47. data/lib/generators/bx_builder_chain/templates/initializer.rb +12 -0
  48. data/lib/generators/bx_builder_chain/templates/migration.rb +33 -0
  49. data/lib/pgvector/pg/binary_decoder/vector.rb +14 -0
  50. data/lib/pgvector/pg/text_decoder/vector.rb +12 -0
  51. data/lib/pgvector/pg.rb +10 -0
  52. data/lib/pgvector.rb +11 -0
  53. data/lib/sequel/plugins/pgvector/class_methods.rb +47 -0
  54. data/lib/sequel/plugins/pgvector/instance_methods.rb +34 -0
  55. data/lib/sequel/plugins/pgvector.rb +12 -0
  56. data/sig/bx_langchain_chat.rbs +4 -0
  57. metadata +238 -0
# frozen_string_literal: true

module BxBuilderChain
  module Utils
    module TokenLength
      #
      # Calculate the `max_tokens:` parameter to be set by calculating the context length of the text minus the prompt length
      #
      # @param content [String | Array<String>] The text or array of texts to validate
      # @param model_name [String] The model name to validate against
      # @return [Integer] Whether the text is valid or not
      # @raise [TokenLimitExceeded] If the text is too long
      #
      class BaseValidator
        # Validates that +content+ fits inside the model's context window.
        #
        # @param content [String, Array<String>] text (or texts) to measure
        # @param model_name [String] model whose token limit applies
        # @param options [Hash] forwarded to .token_length
        # @return [Integer] tokens left over after the content
        # @raise [RuntimeError] when the content exceeds the model's limit
        def self.validate_max_tokens!(content, model_name, options = {})
          text_token_length = if content.is_a?(Array)
            # Array items are serialized to JSON so they are measured the same
            # way they will be sent to the API.
            content.sum { |item| token_length(item.to_json, model_name, options) }
          else
            token_length(content, model_name, options)
          end

          # `token_limit` must be provided by a subclass (e.g. OpenAiValidator).
          leftover_tokens = token_limit(model_name) - text_token_length

          if leftover_tokens < 0
            raise "This model's maximum context length is #{token_limit(model_name)} tokens, but the given text is #{text_token_length} tokens long."
          end

          leftover_tokens
        end

        # Counts BPE tokens in +text+ using the cl100k_base encoding.
        #
        # NOTE(review): previous version timed the encoding load and wrote
        # debug output to stdout on every call; that instrumentation has been
        # removed.
        #
        # @param text [String] the text to tokenize
        # @param model_name [String] unused here; kept for subclass overrides
        # @param options [Hash] unused here; kept for subclass overrides
        # @return [Integer] the token count
        def self.token_length(text, model_name = "", options = {})
          settings = BxBuilderChain::Utils::Tokenization::OpenAiEncodings.cl100k_base
          encoder = BxBuilderChain::Utils::Tokenization::BytePairEncoding.new(
            pat_str: settings["pat_str"],
            mergeable_ranks: settings["mergeable_ranks"]
          )

          encoder.encode(text).count
        end
      end
    end
  end
end
# frozen_string_literal: true

module BxBuilderChain
  module Utils
    module TokenLength
      #
      # This class is meant to validate the length of the text passed in to OpenAI's API.
      # It is used to validate the token length before the API call is made
      #
      class OpenAiValidator < BaseValidator
        # Maximum context length per model.
        # Source:
        # https://platform.openai.com/docs/api-reference/embeddings
        # https://platform.openai.com/docs/models/gpt-4
        TOKEN_LIMITS = {
          "text-embedding-ada-002" => 8191,
          "gpt-3.5-turbo" => 4096,
          "gpt-3.5-turbo-0301" => 4096,
          "gpt-3.5-turbo-0613" => 4096,
          "gpt-3.5-turbo-16k" => 16384,
          "gpt-3.5-turbo-16k-0613" => 16384,
          "text-davinci-003" => 4097,
          "text-davinci-002" => 4097,
          "code-davinci-002" => 8001,
          "gpt-4" => 8192,
          "gpt-4-0314" => 8192,
          "gpt-4-0613" => 8192,
          "gpt-4-32k" => 32768,
          "gpt-4-32k-0314" => 32768,
          "gpt-4-32k-0613" => 32768,
          "text-curie-001" => 2049,
          "text-babbage-001" => 2049,
          "text-ada-001" => 2049,
          "davinci" => 2049,
          "curie" => 2049,
          "babbage" => 2049,
          "ada" => 2049
        }.freeze

        # Looks up the context-window size for +model_name+.
        #
        # Previously an unknown model returned nil, which surfaced later as a
        # cryptic NoMethodError in validate_max_tokens!; now it fails fast with
        # a descriptive error.
        #
        # @param model_name [String] an OpenAI model identifier
        # @return [Integer] the maximum number of tokens the model accepts
        # @raise [ArgumentError] when the model is not listed in TOKEN_LIMITS
        def self.token_limit(model_name)
          TOKEN_LIMITS.fetch(model_name) do
            raise ArgumentError, "Unknown model name: #{model_name.inspect}; no token limit available"
          end
        end

        # NOTE: validate_max_tokens! and token_length are inherited from
        # BaseValidator; the identical re-definitions that used to live here
        # were removed as duplication.
      end
    end
  end
end
# frozen_string_literal: true

module BxBuilderChain
  module Utils
    module Tokenization
      # Byte-pair encoder: splits text with a pattern, then repeatedly merges
      # the adjacent byte pair with the lowest rank until no mergeable pair
      # remains, mapping each resulting piece to its rank (token id).
      class BytePairEncoding
        # @param pat_str [String, Regexp] word-splitting pattern
        # @param mergeable_ranks [Hash{String => Integer}] BPE merge table
        def initialize(pat_str:, mergeable_ranks:)
          @pat_str = pat_str
          @mergeable_ranks = mergeable_ranks
          @decoder = mergeable_ranks.invert
          @pat = Regexp.new(pat_str)
          # Accumulates the string pieces of every encode call, for later
          # visualisation via #visualise_tokenised_string.
          @tokenized_string = []
        end

        # Encodes +text+ into an array of token ids.
        #
        # @param text [String] text to tokenize
        # @param visualise [String, nil] 'colour'/'color', 'simple', or nil
        # @return [Array<Integer>] token ids (nil for pieces missing from the rank table)
        def encode(text, visualise: nil)
          words = text.scan(@pat)
          words.flat_map { |word| bpe_encode(word.bytes, visualise: visualise) }
        end

        # Prints the accumulated token pieces in colour, if any were produced.
        def visualise_tokenised_string
          visualise_tokens_coloured(@tokenized_string) unless @tokenized_string.empty?
        end

        private

        # Runs the BPE merge loop over one word's bytes.
        def bpe_encode(input, visualise: nil)
          parts = input.map(&:chr)
          visualiser = select_visualiser(visualise)

          # Keep merging the lowest-ranked adjacent pair until none remains.
          until (min_pair = find_min_rank_pair(parts)).nil?
            visualiser.call(parts)
            idx = min_pair[:index]
            parts[idx, 2] = [parts[idx] + parts[idx + 1]]
          end

          @tokenized_string.concat(parts)
          puts '' unless visualise.nil?
          parts.map { |part| @mergeable_ranks[part] }
        end

        # Finds the adjacent pair with the lowest merge rank, or nil.
        def find_min_rank_pair(parts)
          pairs = parts.each_cons(2).with_index.map do |pair, i|
            rank = @mergeable_ranks[pair.join]
            { pair: pair, rank: rank, index: i } if rank
          end.compact
          pairs.min_by { |entry| entry[:rank] }
        end

        # Maps a visualisation mode to a callable; unknown modes are no-ops.
        def select_visualiser(mode)
          case mode
          when 'colour', 'color'
            method(:visualise_tokens_coloured)
          when 'simple'
            method(:visualise_tokens_simple)
          else
            ->(_tokens) {}
          end
        end

        # Prints each token piece on a cycling ANSI background colour.
        def visualise_tokens_coloured(token_values)
          backgrounds = [167, 179, 185, 77, 80, 68, 134].cycle
          output = token_values.map { |value| "\e[48;5;#{backgrounds.next}m#{value}" }.join
          puts "#{output}\e[0m"
        end

        # Plain (uncoloured) token dump.
        def visualise_tokens_simple(token_values)
          puts token_values
        end
      end
    end
  end
end
# frozen_string_literal: true

require 'base64'

module BxBuilderChain
  module Utils
    module Tokenization
      # Loads OpenAI tokenizer encodings (currently cl100k_base) from the
      # bundled tiktoken data file.
      class OpenAiEncodings
        ENDOFTEXT = "<|endoftext|>"
        FIM_PREFIX = "<|fim_prefix|>"
        FIM_MIDDLE = "<|fim_middle|>"
        FIM_SUFFIX = "<|fim_suffix|>"
        ENDOFPROMPT = "<|endofprompt|>"

        # Parses a tiktoken BPE file into a { token => rank } hash.
        # Each line has the form "<base64-encoded token> <rank>".
        #
        # @param tiktoken_bpe_file [String] path to the .tiktoken file
        # @return [Hash{String => Integer}] mergeable ranks keyed by raw token bytes
        def self.load_tiktoken_bpe(tiktoken_bpe_file)
          File.readlines(tiktoken_bpe_file, chomp: true).each_with_object({}) do |line, hash|
            next if line.empty? # tolerate trailing blank lines

            token, rank = line.split
            hash[Base64.decode64(token)] = rank.to_i
          end
        end

        # Returns the cl100k_base encoding settings.
        #
        # The rank table is ~100k entries, so the result is memoized: the data
        # file is read and parsed only once per process instead of on every
        # token-length check.
        #
        # @return [Hash] keys: "name", "pat_str", "mergeable_ranks", "special_tokens"
        def self.cl100k_base
          @cl100k_base ||= begin
            mergeable_ranks = load_tiktoken_bpe(File.join(__dir__, '..', 'token_data', 'cl100k_base.tiktoken'))

            special_tokens = {
              ENDOFTEXT => 100257,
              FIM_PREFIX => 100258,
              FIM_MIDDLE => 100259,
              FIM_SUFFIX => 100260,
              ENDOFPROMPT => 100276
            }

            {
              "name" => "cl100k_base",
              "pat_str" => /(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+/,
              "mergeable_ranks" => mergeable_ranks,
              "special_tokens" => special_tokens
            }
          end
        end
      end
    end
  end
end
# frozen_string_literal: true

module BxBuilderChain::Vectorsearch
  # = Vector Databases
  # A vector database is a type of database that stores data as high-dimensional vectors,
  # which are mathematical representations of features or attributes.
  #
  # == Available vector databases
  #
  # - {BxBuilderChain::Vectorsearch::Pgvector}
  #
  # == Usage
  #
  # 1. Pick a vector database from the list above.
  # 2. Review its documentation to install the required gems and configure credentials.
  # 3. Instantiate the vector database class:
  #
  #      pgvector = BxBuilderChain::Vectorsearch::Pgvector.new(llm: llm_instance)
  #
  # == Schema Creation
  #
  # `create_default_schema` creates the default schema in your vector database.
  #
  #      search.create_default_schema
  #
  # == Adding Data
  #
  # You can add data with:
  # 1. `add_data(paths:)` to add any supported file type
  #
  #      search.add_data(paths: [my_pdf, my_text, my_docx, my_csv])
  #
  # 2. `add_texts(texts:)` to only add textual data
  #
  #      search.add_texts(texts: ["Lorem Ipsum is simply dummy text...", "..."])
  #
  # == Retrieving Data
  #
  # `similarity_search_by_vector(embedding:, k:)` searches the vector database for the
  # closest `k` embeddings.
  #
  # `similarity_search(query:, k:)` generates an embedding for the query and searches the
  # vector database for the closest `k` embeddings.
  #
  # `ask(question:)` generates an embedding for the passed-in question, searches the vector
  # database for the closest embeddings and then passes these as context to the LLM to
  # generate an answer to the question.
  #
  #      search.ask(question: "What is lorem ipsum?")
  #
  class Base
    include BxBuilderChain::DependencyHelper

    attr_reader :client, :table_name, :llm

    DEFAULT_METRIC = "cosine"

    # @param llm [Object] The LLM client to use
    def initialize(llm:)
      @llm = llm
    end

    # Method supported by Vectorsearch DB to retrieve a default schema
    def get_default_schema
      raise NotImplementedError, "#{self.class.name} does not support retrieving a default schema"
    end

    # Method supported by Vectorsearch DB to create a default schema
    def create_default_schema
      raise NotImplementedError, "#{self.class.name} does not support creating a default schema"
    end

    # Method supported by Vectorsearch DB to delete the default schema
    def destroy_default_schema
      raise NotImplementedError, "#{self.class.name} does not support deleting a default schema"
    end

    # Method supported by Vectorsearch DB to add a list of texts to the index
    def add_texts(**kwargs)
      raise NotImplementedError, "#{self.class.name} does not support adding texts"
    end

    # Method supported by Vectorsearch DB to update a list of texts to the index
    def update_texts(**kwargs)
      raise NotImplementedError, "#{self.class.name} does not support updating texts"
    end

    # Method supported by Vectorsearch DB to search for similar texts in the index
    def similarity_search(**kwargs)
      raise NotImplementedError, "#{self.class.name} does not support similarity search"
    end

    # Method supported by Vectorsearch DB to search for similar texts in the index by the passed in vector.
    # You must generate your own vector using the same LLM that generated the embeddings stored in the Vectorsearch DB.
    def similarity_search_by_vector(**kwargs)
      raise NotImplementedError, "#{self.class.name} does not support similarity search by vector"
    end

    # Method supported by Vectorsearch DB to answer a question given a context (data) pulled from your Vectorsearch DB.
    def ask(**kwargs)
      raise NotImplementedError, "#{self.class.name} does not support asking questions"
    end

    # Builds the final LLM prompt from the given (or configured default) template.
    # The template is expected to contain %{context} and %{question} placeholders.
    def generate_prompt(question:, context:, prompt_template: nil)
      template = prompt_template || BxBuilderChain.configuration.default_prompt_template
      template % {context: context, question: question}
    end

    # Loads each path, chunks its contents and indexes the chunk texts.
    #
    # @param paths [String, Array<String>] one or more file paths / URLs
    # @return whatever #add_texts returns for the collected texts
    # @raise [ArgumentError] when no paths are given
    def add_data(paths:)
      raise ArgumentError, "Paths must be provided" if Array(paths).empty?

      texts = Array(paths).flatten.flat_map do |path|
        # Loader#load may yield no chunks; Array() guards against the nil that
        # previously raised NoMethodError on .map.
        chunks = BxBuilderChain::Loader.new(path).load&.chunks
        Array(chunks).map { |chunk| chunk[:text] }
      end

      add_texts(texts: texts)
    end

    def self.logger_options
      {
        color: :blue
      }
    end
  end
end
# frozen_string_literal: true

module BxBuilderChain::Vectorsearch
  #
  # The PostgreSQL (pgvector) vector search adapter
  #
  # Usage:
  #   pgvector = BxBuilderChain::Vectorsearch::Pgvector.new(llm: llm, namespaces: ['public'])
  #
  class Pgvector < Base
    # The operators supported by the PostgreSQL vector search adapter
    OPERATORS = {
      "cosine_distance" => "cosine",
      "euclidean_distance" => "euclidean"
    }.freeze
    DEFAULT_OPERATOR = "cosine_distance"

    attr_reader :db, :operator, :table_name, :namespace_column, :namespaces, :documents_table

    # @param llm [Object] The LLM client used for embeddings and chat
    # @param namespaces [Array<String>, String] namespaces to read/write; the
    #   configured public namespace is always appended (see #set_namespaces).
    #   NOTE: the previous default `[config.public_namespace] || ['public']`
    #   never fell back to 'public' because an array literal is always truthy;
    #   the fallback now applies to the element itself.
    def initialize(llm:, namespaces: [BxBuilderChain.configuration.public_namespace || 'public'])
      depends_on "sequel"
      require "sequel"

      @db = Sequel.connect(BxBuilderChain.configuration.pg_url)
      @table_name = "bx_builder_chain_embeddings"
      @namespace_column = "namespace"
      set_namespaces(namespaces)
      @threshold = BxBuilderChain.configuration.threshold

      validate_threshold(@threshold)

      @operator = OPERATORS[DEFAULT_OPERATOR]

      super(llm: llm)
    end

    # Anonymous Sequel model over the embeddings table with pgvector support.
    def documents_model
      Class.new(Sequel::Model(@table_name.to_sym)) do
        plugin :pgvector, :vectors
      end
    end

    # Upsert a list of texts to the index
    # @param texts [Array<String>] The texts to add to the index
    # @param ids [Array<Integer>] The ids of the objects to add to the index, in the same order as the texts
    # @return [PG::Result] The response from the database including the ids of
    #   the added or updated texts.
    def upsert_texts(texts:, ids:)
      data = texts.zip(ids).flat_map do |(text, id)|
        {id: id, content: text, vectors: llm.embed(text: text).to_s, namespace: namespaces[0]}
      end
      @db[@table_name.to_sym]
        .insert_conflict(
          target: :id,
          update: {content: Sequel[:excluded][:content], vectors: Sequel[:excluded][:vectors]}
        )
        .multi_insert(data, return: :primary_key)
    end

    # Add a list of texts to the index
    # @param texts [Array<String>] The texts to add to the index
    # @param ids [Array<String>] The ids to add to the index, in the same order as the texts
    # @return [Array<Integer>] The ids of the added texts.
    def add_texts(texts:, ids: nil)
      if ids.nil? || ids.empty?
        data = texts.map do |text|
          {content: text, vectors: llm.embed(text: text).to_s, namespace: namespaces[0]}
        end

        @db[@table_name.to_sym].multi_insert(data, return: :primary_key)
      else
        upsert_texts(texts: texts, ids: ids)
      end
    end

    # Update a list of ids and corresponding texts in the index
    # @param texts [Array<String>] The texts to add to the index
    # @param ids [Array<String>] The ids to add to the index, in the same order as the texts
    # @return [Array<Integer>] The ids of the updated texts.
    def update_texts(texts:, ids:)
      upsert_texts(texts: texts, ids: ids)
    end

    # Creates the pgvector extension and the three gem tables if missing.
    def create_default_schema
      db.run "CREATE EXTENSION IF NOT EXISTS vector"

      namespace_column = @namespace_column
      vector_dimension = llm.default_dimension || 1000

      # bx_builder_chain_embeddings table
      db.create_table? :bx_builder_chain_embeddings do
        primary_key :id
        text :content
        column :vectors, "vector(#{vector_dimension})"
        text namespace_column.to_sym, default: 'public'

        index namespace_column.to_sym
      end

      # bx_builder_chain_documents table
      db.create_table? :bx_builder_chain_documents do
        primary_key :id
        text :name
        text namespace_column.to_sym, default: 'public'
        timestamp :created_at
        timestamp :updated_at

        index [:name, namespace_column.to_sym], unique: true
      end

      # bx_builder_chain_document_chunks join table
      db.create_table? :bx_builder_chain_document_chunks do
        primary_key :id
        foreign_key :document_id, :bx_builder_chain_documents, null: false, on_delete: :cascade
        foreign_key :embedding_id, :bx_builder_chain_embeddings, null: false, on_delete: :cascade

        unique [:document_id, :embedding_id]
      end
    end

    # Destroy default schema (drop order respects foreign keys)
    def destroy_default_schema
      db.drop_table? :bx_builder_chain_document_chunks
      db.drop_table? :bx_builder_chain_documents
      db.drop_table? :bx_builder_chain_embeddings
    end

    # Search for similar texts in the index
    # @param query [String] The text to search for
    # @param k [Integer] The number of top results to return
    # @return [Array<Hash>] The results of the search
    def similarity_search(query:, k: 4)
      embedding = llm.embed(text: query)

      similarity_search_by_vector(
        embedding: embedding,
        k: k
      )
    end

    # Search for similar texts in the index by the passed in vector.
    # You must generate your own vector using the same LLM that generated the embeddings stored in the Vectorsearch DB.
    # @param embedding [Array<Float>] The vector to search for
    # @param k [Integer] The number of top results to return
    # @return [Array<Hash>] The results of the search
    def similarity_search_by_vector(embedding:, k: 4)
      db.transaction do # BEGIN
        documents_model
          .nearest_neighbors(:vectors, embedding, distance: operator, threshold: @threshold)
          .where(@namespace_column.to_sym => namespaces)
          .limit(k)
      end
    end

    # Ask a question and return the answer
    # @param question [String] The question to ask
    # @param context_results [Integer] number of similar chunks to use as context
    # @param prompt_template [String, nil] optional template overriding the configured default
    # @return [String] The answer to the question
    def ask(question:, context_results: 4, prompt_template: nil)
      search_results = similarity_search(query: question, k: context_results)

      context = search_results.map { |result| result.content.to_s }.join("\n---\n")

      # BUGFIX: the custom template used to be dropped here (prompt_template: nil
      # was passed through regardless of the argument).
      prompt = generate_prompt(question: question, context: context, prompt_template: prompt_template)

      llm.chat(prompt: prompt)
    end

    # Loads, chunks and indexes each file, recording a document row and the
    # document→chunk links inside one transaction.
    #
    # @param paths [Array<String, Hash>] paths or {path:, filename:} hashes
    # @return [Array<Integer>] ids of all inserted embedding rows
    # @raise [ArgumentError] when no paths are given
    def add_data(paths:)
      raise ArgumentError, "Paths must be provided" if Array(paths).empty?

      all_added_chunk_ids = []

      @db.transaction do # all-or-nothing across every file
        paths.each do |file_n_path|
          path, file = extract_path_and_file(file_n_path)

          # Array() guards against loaders that produce no chunks; previously
          # this raised NoMethodError on nil.
          chunks = BxBuilderChain::Loader.new(path).load&.chunks
          texts = Array(chunks).map { |chunk| chunk[:text] }

          added_chunk_ids_for_current_path = add_texts(texts: texts)
          all_added_chunk_ids.concat(added_chunk_ids_for_current_path)

          document_record_id = @db[:bx_builder_chain_documents].insert(
            name: file,
            namespace: namespaces[0],
            created_at: Time.now.utc,
            updated_at: Time.now.utc
          )

          document_chunks_data = added_chunk_ids_for_current_path.map do |chunk_id|
            {document_id: document_record_id, embedding_id: chunk_id}
          end
          @db[:bx_builder_chain_document_chunks].multi_insert(document_chunks_data)
        end
      end

      all_added_chunk_ids
    end

    private

    # Accepts either a plain path or a {path:, filename:} hash.
    def extract_path_and_file(entry)
      entry.is_a?(Hash) ? [entry[:path], entry[:filename]] : [entry, entry]
    end

    def validate_threshold(threshold)
      raise "Threshold must be between 0.0 and 2.0 (0.0 being a perfect match) or nil (ignore threshold)" if !threshold.nil? && (threshold > 2 || threshold < 0)
    end

    # Always include the configured public namespace so shared documents are found.
    def set_namespaces(namespaces)
      namespaces = [namespaces] unless namespaces.is_a?(Array)
      @namespaces = namespaces.push(BxBuilderChain.configuration.public_namespace).uniq
    end
  end
end
# frozen_string_literal: true

module BxBuilderChain
  # Gem version, referenced by the gemspec.
  VERSION = "0.1.0"
end
# frozen_string_literal: true

require "logger"
require 'dotenv'
require "zeitwerk"

# Load environment variables from a .env file before anything reads configuration.
Dotenv.load

# Zeitwerk autoloads every file under lib/ based on its path; this entry file
# is ignored because it is the one being loaded manually.
loader = Zeitwerk::Loader.for_gem(warn_on_extra_files: false)
loader.ignore("#{__dir__}/bx_builder_chain.rb")
loader.setup
module BxBuilderChain
  class << self
    attr_reader :logger
    attr_writer :configuration

    # Replaces the gem-wide logger.
    #
    # BUGFIX: this used `@logger ||= logger`, and because a default logger is
    # assigned at load time (below), user assignments were permanently silent
    # no-ops. Plain assignment lets callers override the default.
    def logger=(logger)
      @logger = logger
    end

    # Lazily-built global configuration object.
    def configuration
      @configuration ||= Configuration.new
    end

    # Yields the configuration for block-style setup.
    def configure
      yield(configuration)
    end

    # Restores configuration to defaults (useful in tests).
    def reset_config
      @configuration = Configuration.new
    end
  end

  # Default logger: warnings and above, written to stdout.
  self.logger = ::Logger.new($stdout, level: :warn)

  # Base error class for the gem.
  class Error < StandardError; end

end