bx_builder_chain 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +13 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +22 -0
- data/Gemfile.lock +120 -0
- data/README.md +74 -0
- data/Rakefile +12 -0
- data/bx_builder_chain.gemspec +35 -0
- data/lib/bx_builder_chain/chunker/recursive_text.rb +38 -0
- data/lib/bx_builder_chain/chunker/text.rb +38 -0
- data/lib/bx_builder_chain/configuration.rb +21 -0
- data/lib/bx_builder_chain/data.rb +28 -0
- data/lib/bx_builder_chain/dependency_helper.rb +22 -0
- data/lib/bx_builder_chain/llm/base.rb +64 -0
- data/lib/bx_builder_chain/llm/open_ai.rb +191 -0
- data/lib/bx_builder_chain/loader.rb +144 -0
- data/lib/bx_builder_chain/processors/base.rb +21 -0
- data/lib/bx_builder_chain/processors/csv.rb +27 -0
- data/lib/bx_builder_chain/processors/docx.rb +25 -0
- data/lib/bx_builder_chain/processors/html.rb +29 -0
- data/lib/bx_builder_chain/processors/json.rb +17 -0
- data/lib/bx_builder_chain/processors/pdf.rb +26 -0
- data/lib/bx_builder_chain/processors/text.rb +17 -0
- data/lib/bx_builder_chain/processors/xlsx.rb +31 -0
- data/lib/bx_builder_chain/utils/token_data/cl100k_base.tiktoken +100256 -0
- data/lib/bx_builder_chain/utils/token_length/base_validator.rb +45 -0
- data/lib/bx_builder_chain/utils/token_length/open_ai_validator.rb +70 -0
- data/lib/bx_builder_chain/utils/tokenization/byte_pair_encoding.rb +72 -0
- data/lib/bx_builder_chain/utils/tokenization/open_ai_encodings.rb +44 -0
- data/lib/bx_builder_chain/vectorsearch/base.rb +160 -0
- data/lib/bx_builder_chain/vectorsearch/pgvector.rb +228 -0
- data/lib/bx_builder_chain/version.rb +5 -0
- data/lib/bx_builder_chain.rb +38 -0
- data/lib/generators/bx_builder_chain/install_generator.rb +42 -0
- data/lib/generators/bx_builder_chain/templates/app/admin/bx_builder_chain_document.rb +65 -0
- data/lib/generators/bx_builder_chain/templates/app/controllers/bx_builder_chain/documents_controller.rb +65 -0
- data/lib/generators/bx_builder_chain/templates/app/controllers/bx_builder_chain/questions_controller.rb +33 -0
- data/lib/generators/bx_builder_chain/templates/app/controllers/bx_builder_chain/test_controller.rb +10 -0
- data/lib/generators/bx_builder_chain/templates/app/models/bx_builder_chain/document.rb +26 -0
- data/lib/generators/bx_builder_chain/templates/app/models/bx_builder_chain/document_chunk.rb +9 -0
- data/lib/generators/bx_builder_chain/templates/app/models/bx_builder_chain/embedding.rb +9 -0
- data/lib/generators/bx_builder_chain/templates/app/services/bx_builder_chain/document_upload_service.rb +47 -0
- data/lib/generators/bx_builder_chain/templates/app/services/bx_builder_chain/question_asking_service.rb +35 -0
- data/lib/generators/bx_builder_chain/templates/app/views/bx_builder_chain/test/form.html.erb +164 -0
- data/lib/generators/bx_builder_chain/templates/app/workers/bx_builder_chain/document_processor_worker.rb +32 -0
- data/lib/generators/bx_builder_chain/templates/initializer.rb +12 -0
- data/lib/generators/bx_builder_chain/templates/migration.rb +33 -0
- data/lib/pgvector/pg/binary_decoder/vector.rb +14 -0
- data/lib/pgvector/pg/text_decoder/vector.rb +12 -0
- data/lib/pgvector/pg.rb +10 -0
- data/lib/pgvector.rb +11 -0
- data/lib/sequel/plugins/pgvector/class_methods.rb +47 -0
- data/lib/sequel/plugins/pgvector/instance_methods.rb +34 -0
- data/lib/sequel/plugins/pgvector.rb +12 -0
- data/sig/bx_langchain_chat.rbs +4 -0
- metadata +238 -0
@@ -0,0 +1,45 @@
# frozen_string_literal: true

module BxBuilderChain
  module Utils
    module TokenLength
      #
      # Calculates the `max_tokens:` headroom by subtracting the token length
      # of the given content from the model's context window.
      #
      # Subclasses must implement `token_limit(model_name)` returning the
      # context size for the model (see OpenAiValidator).
      #
      class BaseValidator
        # Validate that `content` fits within the model's context window.
        #
        # @param content [String, Array<String>] the text or texts to validate
        # @param model_name [String] the model name to validate against
        # @param options [Hash] reserved for subclass-specific options
        # @return [Integer] the number of tokens left over in the context window
        # @raise [RuntimeError] if the content exceeds the model's token limit
        def self.validate_max_tokens!(content, model_name, options = {})
          text_token_length = if content.is_a?(Array)
            content.sum { |item| token_length(item.to_json, model_name, options) }
          else
            token_length(content, model_name, options)
          end

          leftover_tokens = token_limit(model_name) - text_token_length

          if leftover_tokens < 0
            raise "This model's maximum context length is #{token_limit(model_name)} tokens, but the given text is #{text_token_length} tokens long."
          end

          leftover_tokens
        end

        # Count the tokens in `text` using the cl100k_base BPE encoding.
        #
        # @param text [String] the text to tokenize
        # @param model_name [String] unused here; kept for subclass overrides
        # @param options [Hash] unused here; kept for subclass overrides
        # @return [Integer] the token count
        def self.token_length(text, model_name = "", options = {})
          encoder.encode(text).count
        end

        # Shared BPE encoder, built once per process.
        #
        # Loading the cl100k_base ranks file is expensive; previously it was
        # re-read on every call and logged timing information to stdout
        # (leftover debug code). Memoize instead.
        #
        # @return [BxBuilderChain::Utils::Tokenization::BytePairEncoding]
        def self.encoder
          @encoder ||= begin
            settings = BxBuilderChain::Utils::Tokenization::OpenAiEncodings.cl100k_base
            BxBuilderChain::Utils::Tokenization::BytePairEncoding.new(
              pat_str: settings["pat_str"],
              mergeable_ranks: settings["mergeable_ranks"]
            )
          end
        end
      end
    end
  end
end
@@ -0,0 +1,70 @@
# frozen_string_literal: true

module BxBuilderChain
  module Utils
    module TokenLength
      #
      # Validates the length of the text passed in to OpenAI's API before the
      # API call is made.
      #
      # Token counting and validation are inherited from BaseValidator
      # (previously both methods were duplicated here verbatim); this class
      # only supplies the per-model context window sizes.
      #
      class OpenAiValidator < BaseValidator
        # Maximum context lengths per model.
        # Source:
        # https://platform.openai.com/docs/api-reference/embeddings
        # https://platform.openai.com/docs/models/gpt-4
        TOKEN_LIMITS = {
          "text-embedding-ada-002" => 8191,
          "gpt-3.5-turbo" => 4096,
          "gpt-3.5-turbo-0301" => 4096,
          "gpt-3.5-turbo-0613" => 4096,
          "gpt-3.5-turbo-16k" => 16384,
          "gpt-3.5-turbo-16k-0613" => 16384,
          "text-davinci-003" => 4097,
          "text-davinci-002" => 4097,
          "code-davinci-002" => 8001,
          "gpt-4" => 8192,
          "gpt-4-0314" => 8192,
          "gpt-4-0613" => 8192,
          "gpt-4-32k" => 32768,
          "gpt-4-32k-0314" => 32768,
          "gpt-4-32k-0613" => 32768,
          "text-curie-001" => 2049,
          "text-babbage-001" => 2049,
          "text-ada-001" => 2049,
          "davinci" => 2049,
          "curie" => 2049,
          "babbage" => 2049,
          "ada" => 2049
        }.freeze

        # @param model_name [String] the OpenAI model identifier
        # @return [Integer, nil] the model's context window in tokens, or nil
        #   when the model is unknown
        def self.token_limit(model_name)
          TOKEN_LIMITS[model_name]
        end
      end
    end
  end
end
@@ -0,0 +1,72 @@
# frozen_string_literal: true

module BxBuilderChain
  module Utils
    module Tokenization
      # Byte Pair Encoding tokenizer.
      #
      # Splits text into "words" with `pat_str`, then repeatedly merges the
      # adjacent byte-pair with the lowest rank in `mergeable_ranks` until no
      # mergeable pair remains, and maps the surviving parts to token ids.
      class BytePairEncoding
        # @param pat_str [String, Regexp] pattern used to split text into words
        # @param mergeable_ranks [Hash{String => Integer}] byte sequence => merge rank / token id
        def initialize(pat_str:, mergeable_ranks:)
          @pat_str = pat_str
          @mergeable_ranks = mergeable_ranks
          @decoder = mergeable_ranks.invert
          @pat = Regexp.new(pat_str)
          # Accumulates merged parts across encode calls for later visualisation.
          @tokenized_string = []
        end

        # Encode text into token ids.
        #
        # @param text [String] the text to tokenize
        # @param visualise [String, nil] 'colour'/'color', 'simple', or nil (no output)
        # @return [Array<Integer>] token ids (nil entries for unknown parts)
        def encode(text, visualise: nil)
          words = text.scan(@pat)
          words.flat_map { |word| bpe_encode(word.bytes, visualise: visualise) }
        end

        # Print the accumulated tokenised parts in colour, if any were recorded.
        def visualise_tokenised_string
          visualise_tokens_coloured(@tokenized_string) unless @tokenized_string.empty?
        end

        private

        # Run the BPE merge loop over one word's bytes.
        def bpe_encode(input, visualise: nil)
          parts = input.map(&:chr)
          visualiser = select_visualiser(visualise)

          until (min_pair = find_min_rank_pair(parts)).nil?
            visualiser.call(parts)
            idx = min_pair[:index]
            # Replace the pair at idx with its concatenation.
            parts[idx, 2] = [parts[idx] + parts[idx + 1]]
          end

          @tokenized_string.concat(parts)
          puts '' unless visualise.nil?
          parts.map { |part| @mergeable_ranks[part] }
        end

        # Find the adjacent pair with the lowest merge rank, or nil if none merges.
        def find_min_rank_pair(parts)
          pairs = parts.each_cons(2).with_index.map do |pair, i|
            rank = @mergeable_ranks[pair.join]
            { pair: pair, rank: rank, index: i } if rank
          end.compact
          pairs.min_by { |entry| entry[:rank] }
        end

        # Map a visualisation mode to a callable; unknown modes are a no-op.
        def select_visualiser(mode)
          case mode
          when 'colour', 'color'
            method(:visualise_tokens_coloured)
          when 'simple'
            method(:visualise_tokens_simple)
          else
            ->(_tokens) {}
          end
        end

        # Print the parts with cycling ANSI background colours.
        def visualise_tokens_coloured(token_values)
          backgrounds = [167, 179, 185, 77, 80, 68, 134].cycle
          output = token_values.map do |value|
            "\e[48;5;#{backgrounds.next}m#{value}"
          end.join
          puts "#{output}\e[0m"
        end

        # Print the parts one per line, without colour.
        def visualise_tokens_simple(token_values)
          puts token_values
        end
      end
    end
  end
end
@@ -0,0 +1,44 @@
# frozen_string_literal: true

require 'base64'

module BxBuilderChain
  module Utils
    module Tokenization
      # Loads OpenAI tiktoken BPE encoding definitions (cl100k_base).
      class OpenAiEncodings
        ENDOFTEXT = "<|endoftext|>"
        FIM_PREFIX = "<|fim_prefix|>"
        FIM_MIDDLE = "<|fim_middle|>"
        FIM_SUFFIX = "<|fim_suffix|>"
        ENDOFPROMPT = "<|endofprompt|>"

        # Parse a .tiktoken ranks file into a { token_bytes => rank } hash.
        #
        # Each line has the form "<base64-encoded token> <integer rank>".
        #
        # @param tiktoken_bpe_file [String] path to the ranks file
        # @return [Hash{String => Integer}]
        def self.load_tiktoken_bpe(tiktoken_bpe_file)
          contents = File.read(tiktoken_bpe_file)
          contents.split("\n").each_with_object({}) do |line, hash|
            token, rank = line.split
            hash[Base64.decode64(token)] = rank.to_i
          end
        end

        # Settings for the cl100k_base encoding.
        #
        # Memoized at the class level: the ranks file has ~100k lines and is
        # expensive to parse, so it is loaded at most once per process
        # (previously it was re-read on every call).
        #
        # @return [Hash] keys: "name", "pat_str", "mergeable_ranks", "special_tokens"
        def self.cl100k_base
          @cl100k_base ||= begin
            mergeable_ranks = load_tiktoken_bpe(File.join(__dir__, '..', 'token_data', 'cl100k_base.tiktoken'))

            special_tokens = {
              ENDOFTEXT => 100257,
              FIM_PREFIX => 100258,
              FIM_MIDDLE => 100259,
              FIM_SUFFIX => 100260,
              ENDOFPROMPT => 100276
            }

            {
              "name" => "cl100k_base",
              "pat_str" => /(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+/,
              "mergeable_ranks" => mergeable_ranks,
              "special_tokens" => special_tokens
            }
          end
        end
      end
    end
  end
end
@@ -0,0 +1,160 @@
# frozen_string_literal: true

module BxBuilderChain::Vectorsearch
  # = Vector Databases
  #
  # A vector database is a type of database that stores data as
  # high-dimensional vectors — mathematical representations of features or
  # attributes. Each vector has a certain number of dimensions, which can
  # range from tens to thousands, depending on the complexity and granularity
  # of the data.
  #
  # == Available vector databases
  #
  # - {BxBuilderChain::Vectorsearch::Pgvector}
  #
  # == Usage
  #
  # 1. Pick a vector database from the list above.
  # 2. Review its documentation to install the required gems and configure access.
  # 3. Instantiate the vector database class, passing the LLM client to use:
  #
  #     search = BxBuilderChain::Vectorsearch::Pgvector.new(llm: llm)
  #
  # == Schema Creation
  #
  # `create_default_schema` creates the default schema in your vector database:
  #
  #     search.create_default_schema
  #
  # == Adding Data
  #
  # 1. `add_data(paths:)` to add any supported file type:
  #
  #     search.add_data(paths: [my_pdf, my_text, my_docx, my_csv])
  #
  # 2. `add_texts(texts:)` to add textual data only:
  #
  #     search.add_texts(texts: ["Lorem Ipsum...", "..."])
  #
  # == Retrieving Data
  #
  # `similarity_search(query:, k:)` generates an embedding for the query and
  # searches for the closest `k` records.
  # `similarity_search_by_vector(embedding:, k:)` does the same for a
  # pre-computed embedding.
  # `ask(question:)` embeds the question, retrieves the closest records and
  # passes them as context to the LLM to generate an answer:
  #
  #     search.ask(question: "What is lorem ipsum?")
  #
  class Base
    include BxBuilderChain::DependencyHelper

    attr_reader :client, :table_name, :llm

    DEFAULT_METRIC = "cosine"

    # @param llm [Object] The LLM client to use
    def initialize(llm:)
      @llm = llm
    end

    # Method supported by Vectorsearch DB to retrieve a default schema
    def get_default_schema
      raise NotImplementedError, "#{self.class.name} does not support retrieving a default schema"
    end

    # Method supported by Vectorsearch DB to create a default schema
    def create_default_schema
      raise NotImplementedError, "#{self.class.name} does not support creating a default schema"
    end

    # Method supported by Vectorsearch DB to delete the default schema
    def destroy_default_schema
      raise NotImplementedError, "#{self.class.name} does not support deleting a default schema"
    end

    # Method supported by Vectorsearch DB to add a list of texts to the index
    def add_texts(**kwargs)
      raise NotImplementedError, "#{self.class.name} does not support adding texts"
    end

    # Method supported by Vectorsearch DB to update a list of texts in the index
    def update_texts(**kwargs)
      raise NotImplementedError, "#{self.class.name} does not support updating texts"
    end

    # Method supported by Vectorsearch DB to search for similar texts in the index
    def similarity_search(**kwargs)
      raise NotImplementedError, "#{self.class.name} does not support similarity search"
    end

    # Method supported by Vectorsearch DB to search by a pre-computed vector.
    # The vector must come from the same LLM that generated the stored embeddings.
    def similarity_search_by_vector(**kwargs)
      raise NotImplementedError, "#{self.class.name} does not support similarity search by vector"
    end

    # Method supported by Vectorsearch DB to answer a question given retrieved context.
    def ask(**kwargs)
      raise NotImplementedError, "#{self.class.name} does not support asking questions"
    end

    # Interpolate question and context into the prompt template.
    #
    # @param question [String]
    # @param context [String]
    # @param prompt_template [String, nil] overrides the configured default when given
    # @return [String] the rendered prompt
    def generate_prompt(question:, context:, prompt_template: nil)
      template = prompt_template || BxBuilderChain.configuration.default_prompt_template
      template % {context: context, question: question}
    end

    # Load, chunk and index the files at `paths`.
    #
    # @param paths [String, Array<String>] one or more file paths
    # @raise [ArgumentError] when no paths are given
    def add_data(paths:)
      raise ArgumentError, "Paths must be provided" if Array(paths).empty?

      texts = Array(paths)
        .flatten
        .flat_map do |path|
          # Guard against a loader returning nil chunks (previously this
          # crashed with NoMethodError on nil).
          chunks = BxBuilderChain::Loader.new(path)&.load&.chunks
          Array(chunks).map { |chunk| chunk[:text] }
        end

      add_texts(texts: texts)
    end

    def self.logger_options
      {
        color: :blue
      }
    end
  end
end
@@ -0,0 +1,228 @@
# frozen_string_literal: true

module BxBuilderChain::Vectorsearch
  #
  # The PostgreSQL (pgvector) vector search adapter.
  #
  # Usage:
  #     pgvector = BxBuilderChain::Vectorsearch::Pgvector.new(llm: llm, namespaces: ['public'])
  #
  # Connection URL, similarity threshold and the public namespace are read
  # from BxBuilderChain.configuration.
  #
  class Pgvector < Base
    # The operators supported by the PostgreSQL vector search adapter
    OPERATORS = {
      "cosine_distance" => "cosine",
      "euclidean_distance" => "euclidean"
    }.freeze
    DEFAULT_OPERATOR = "cosine_distance"

    attr_reader :db, :operator, :table_name, :namespace_column, :namespaces, :documents_table

    # @param llm [Object] The LLM client used to embed texts and answer questions
    # @param namespaces [Array<String>, String] namespaces to query; the configured
    #   public namespace is always appended. Inserts go to the first namespace.
    #   (The previous default applied `|| ['public']` to the whole array, which
    #   is always truthy — the fallback now applies to the configured value.)
    def initialize(llm:, namespaces: [BxBuilderChain.configuration.public_namespace || 'public'])
      depends_on "sequel"
      require "sequel"

      @db = Sequel.connect(BxBuilderChain.configuration.pg_url)
      @table_name = "bx_builder_chain_embeddings"
      @namespace_column = "namespace"
      set_namespaces(namespaces)
      @threshold = BxBuilderChain.configuration.threshold

      validate_threshold(@threshold)

      @operator = OPERATORS[DEFAULT_OPERATOR]

      super(llm: llm)
    end

    # Anonymous Sequel model over the embeddings table with pgvector support.
    def documents_model
      Class.new(Sequel::Model(@table_name.to_sym)) do
        plugin :pgvector, :vectors
      end
    end

    # Upsert a list of texts to the index.
    #
    # @param texts [Array<String>] The texts to add to the index
    # @param ids [Array<Integer>] The ids of the objects, in the same order as the texts
    # @return [PG::Result] The response from the database including the ids of
    #   the added or updated texts.
    def upsert_texts(texts:, ids:)
      data = texts.zip(ids).map do |(text, id)|
        {id: id, content: text, vectors: llm.embed(text: text).to_s, namespace: namespaces[0]}
      end
      @db[@table_name.to_sym]
        .insert_conflict(
          target: :id,
          update: {content: Sequel[:excluded][:content], vectors: Sequel[:excluded][:vectors]}
        )
        .multi_insert(data, return: :primary_key)
    end

    # Add a list of texts to the index.
    #
    # @param texts [Array<String>] The texts to add to the index
    # @param ids [Array<String>, nil] Optional ids, in the same order as the texts;
    #   when given, existing rows are upserted instead of inserted.
    # @return [Array<Integer>] The ids of the added texts.
    def add_texts(texts:, ids: nil)
      if ids.nil? || ids.empty?
        data = texts.map do |text|
          {content: text, vectors: llm.embed(text: text).to_s, namespace: namespaces[0]}
        end

        @db[@table_name.to_sym].multi_insert(data, return: :primary_key)
      else
        upsert_texts(texts: texts, ids: ids)
      end
    end

    # Update a list of ids and corresponding texts in the index.
    #
    # @param texts [Array<String>] The texts to update
    # @param ids [Array<String>] The ids to update, in the same order as the texts
    # @return [Array<Integer>] The ids of the updated texts.
    def update_texts(texts:, ids:)
      upsert_texts(texts: texts, ids: ids)
    end

    # Create the pgvector extension and the three default tables.
    def create_default_schema
      db.run "CREATE EXTENSION IF NOT EXISTS vector"

      namespace_column = @namespace_column
      vector_dimension = llm.default_dimension || 1000

      # bx_builder_chain_embeddings table
      db.create_table? :bx_builder_chain_embeddings do
        primary_key :id
        text :content
        column :vectors, "vector(#{vector_dimension})"
        text namespace_column.to_sym, default: 'public'

        index namespace_column.to_sym
      end

      # bx_builder_chain_documents table
      db.create_table? :bx_builder_chain_documents do
        primary_key :id
        text :name
        text namespace_column.to_sym, default: 'public'
        timestamp :created_at
        timestamp :updated_at

        index [:name, namespace_column.to_sym], unique: true
      end

      # bx_builder_chain_document_chunks join table
      db.create_table? :bx_builder_chain_document_chunks do
        primary_key :id
        foreign_key :document_id, :bx_builder_chain_documents, null: false, on_delete: :cascade
        foreign_key :embedding_id, :bx_builder_chain_embeddings, null: false, on_delete: :cascade

        unique [:document_id, :embedding_id]
      end
    end

    # Destroy default schema (children first to satisfy foreign keys).
    def destroy_default_schema
      db.drop_table? :bx_builder_chain_document_chunks
      db.drop_table? :bx_builder_chain_documents
      db.drop_table? :bx_builder_chain_embeddings
    end

    # Search for similar texts in the index.
    #
    # @param query [String] The text to search for
    # @param k [Integer] The number of top results to return
    # @return [Array<Hash>] The results of the search
    def similarity_search(query:, k: 4)
      embedding = llm.embed(text: query)

      similarity_search_by_vector(
        embedding: embedding,
        k: k
      )
    end

    # Search for similar texts by a pre-computed vector. The vector must come
    # from the same LLM that generated the stored embeddings.
    #
    # @param embedding [Array<Float>] The vector to search for
    # @param k [Integer] The number of top results to return
    # @return [Array<Hash>] The results of the search
    def similarity_search_by_vector(embedding:, k: 4)
      db.transaction do # BEGIN
        documents_model
          .nearest_neighbors(:vectors, embedding, distance: operator, threshold: @threshold)
          .where(@namespace_column.to_sym => namespaces)
          .limit(k)
      end
    end

    # Ask a question and return the answer.
    #
    # @param question [String] The question to ask
    # @param context_results [Integer] Number of similar records to use as context
    # @param prompt_template [String, nil] Optional prompt template override
    # @return [String] The answer to the question
    def ask(question:, context_results: 4, prompt_template: nil)
      search_results = similarity_search(query: question, k: context_results)

      context = search_results.map do |result|
        result.content.to_s
      end
      context = context.join("\n---\n")

      # Forward the caller-supplied template; previously `nil` was always
      # passed, silently discarding the prompt_template argument.
      prompt = generate_prompt(question: question, context: context, prompt_template: prompt_template)

      llm.chat(prompt: prompt)
    end

    # Load, chunk, embed and index the files at `paths`, recording a document
    # row and document->chunk links for each file, all in one transaction.
    #
    # @param paths [Array<String, Hash>] file paths, or {path:, filename:} hashes
    # @return [Array<Integer>] ids of all inserted embedding rows
    # @raise [ArgumentError] when no paths are given
    def add_data(paths:)
      raise ArgumentError, "Paths must be provided" if Array(paths).empty?

      all_added_chunk_ids = []

      @db.transaction do # Start the transaction
        paths.each do |file_n_path|
          path, file = extract_path_and_file(file_n_path)

          # Guard against a loader returning nil chunks (previously this
          # crashed with NoMethodError on nil).
          chunks = BxBuilderChain::Loader.new(path)&.load&.chunks
          texts = Array(chunks).map { |chunk| chunk[:text] }

          texts.flatten!

          added_chunk_ids_for_current_path = add_texts(texts: texts)

          all_added_chunk_ids.concat(added_chunk_ids_for_current_path)

          document_record_id = @db[:bx_builder_chain_documents].insert(
            name: file,
            namespace: namespaces[0],
            created_at: Time.now.utc,
            updated_at: Time.now.utc
          )

          document_chunks_data = added_chunk_ids_for_current_path.map do |chunk_id|
            {document_id: document_record_id, embedding_id: chunk_id}
          end
          @db[:bx_builder_chain_document_chunks].multi_insert(document_chunks_data)
        end
      end # End the transaction

      all_added_chunk_ids
    end

    private

    # Accept either a plain path or a {path:, filename:} hash.
    def extract_path_and_file(entry)
      entry.is_a?(Hash) ? [entry[:path], entry[:filename]] : [entry, entry]
    end

    # @raise [RuntimeError] when the threshold is outside the valid range
    def validate_threshold(threshold)
      raise "Threshold must be between 0.0 and 2.0 (0.0 being a perfect match) or nil (ignore threshold)" if !threshold.nil? && (threshold > 2 || threshold < 0)
    end

    # Normalize to an array and always include the public namespace.
    def set_namespaces(namespaces)
      namespaces = [namespaces] unless namespaces.is_a?(Array)
      @namespaces = namespaces.push(BxBuilderChain.configuration.public_namespace).uniq
    end
  end
end
@@ -0,0 +1,38 @@
# frozen_string_literal: true

require "logger"
require 'dotenv'
require "zeitwerk"

Dotenv.load
loader = Zeitwerk::Loader.for_gem(warn_on_extra_files: false)
loader.ignore("#{__dir__}/bx_builder_chain.rb")
loader.setup

module BxBuilderChain
  class << self
    attr_reader :logger
    attr_writer :configuration

    # Replace the gem-wide logger.
    #
    # The previous implementation used `@logger ||= logger`, and a default
    # logger is installed at load time below — so every later assignment by
    # a caller was silently ignored. Assign unconditionally instead.
    def logger=(logger)
      @logger = logger
    end

    # Lazily-built global configuration object.
    def configuration
      @configuration ||= Configuration.new
    end

    # Block-style configuration:
    #   BxBuilderChain.configure { |c| c.pg_url = ... }
    def configure
      yield(configuration)
    end

    # Restore configuration to defaults (useful in tests).
    def reset_config
      @configuration = Configuration.new
    end
  end

  # Default logger; override with BxBuilderChain.logger = your_logger.
  self.logger = ::Logger.new($stdout, level: :warn)

  class Error < StandardError; end

end