boxcars 0.2.7 → 0.2.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c58233487372c9e99f160786cc18f8e5464262479110ae2fcf2048fc09f3b338
4
- data.tar.gz: 237d72649f3faef07c493e93f514077afd144578782fc8861f5a0b0b40152828
3
+ metadata.gz: 69b70e1d02b1ec206438eaaf857a0495fe35ab01e64a265656fe21230675306f
4
+ data.tar.gz: 8681b9625a0684f1091eea7a4626964929b271370068c90b82dabeee4253d803
5
5
  SHA512:
6
- metadata.gz: a3c148dc072fea12862849183bfc37f4189639982e8a7409afab267aa22baed5b911dff223562705be42c5d57e06e76e6e8814b5366c120444fca529fe204187
7
- data.tar.gz: acec67f2d4b5be3b660c2c6e8d90ef772925903f053f0a62965fc61f335a4fc84bbeee47265c49a596945e846d31ecad3b5a93602521037f91ed689c402712d8
6
+ metadata.gz: eb5c0c00f8fcdbbd6d8a1999d7544fc584c701fdf9a8a9c271fff6d9795f75ef9cab058fee2c6829808a764c892cc3e2f4e4a8717155d34d6514b46d744e632c
7
+ data.tar.gz: b8fb4ad34d7b93d47388f037d1d93e9e7245303740bc04d58d21942112ff97315e5dd31fdba77e275b52ddba85ca1055b897646ae1e606daee485583d50c52a6
data/CHANGELOG.md CHANGED
@@ -2,7 +2,22 @@
2
2
 
3
3
  ## [Unreleased](https://github.com/BoxcarsAI/boxcars/tree/HEAD)
4
4
 
5
- [Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.5...HEAD)
5
+ [Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.7...HEAD)
6
+
7
+ **Closed issues:**
8
+
9
+ - Getting the same verbosity as in the examples [\#54](https://github.com/BoxcarsAI/boxcars/issues/54)
10
+
11
+ **Merged pull requests:**
12
+
13
+ - Add Engine for Gpt4all [\#55](https://github.com/BoxcarsAI/boxcars/pull/55) ([francis](https://github.com/francis))
14
+ - update google search to return URL for result if present [\#53](https://github.com/BoxcarsAI/boxcars/pull/53) ([francis](https://github.com/francis))
15
+ - Draft: added gpt4all [\#49](https://github.com/BoxcarsAI/boxcars/pull/49) ([jaigouk](https://github.com/jaigouk))
16
+ - Embeddings with hnswlib [\#48](https://github.com/BoxcarsAI/boxcars/pull/48) ([jaigouk](https://github.com/jaigouk))
17
+
18
+ ## [v0.2.7](https://github.com/BoxcarsAI/boxcars/tree/v0.2.7) (2023-04-13)
19
+
20
+ [Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.5...v0.2.7)
6
21
 
7
22
  **Closed issues:**
8
23
 
data/Gemfile CHANGED
@@ -30,3 +30,5 @@ gem "faraday-retry", "~> 2.0"
30
30
  gem "activesupport", "~> 7.0"
31
31
 
32
32
  gem "rest-client", "~> 2.1"
33
+
34
+ gem "hnswlib", "~> 0.8.1"
data/Gemfile.lock CHANGED
@@ -1,8 +1,9 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- boxcars (0.2.7)
4
+ boxcars (0.2.8)
5
5
  google_search_results (~> 2.2)
6
+ gpt4all (~> 0.0.4)
6
7
  ruby-openai (~> 3.0)
7
8
 
8
9
  GEM
@@ -71,7 +72,12 @@ GEM
71
72
  rainbow (>= 2.2.1)
72
73
  rake (>= 10.0)
73
74
  google_search_results (2.2.0)
75
+ gpt4all (0.0.5)
76
+ faraday (~> 2.7)
77
+ os (~> 1.1)
78
+ tty-progressbar (~> 0.18.2)
74
79
  hashdiff (1.0.1)
80
+ hnswlib (0.8.1)
75
81
  http-accept (1.7.0)
76
82
  http-cookie (1.0.5)
77
83
  domain_name (~> 0.5)
@@ -100,6 +106,7 @@ GEM
100
106
  octokit (4.25.1)
101
107
  faraday (>= 1, < 3)
102
108
  sawyer (~> 0.9)
109
+ os (1.1.4)
103
110
  parallel (1.22.1)
104
111
  parser (3.2.1.1)
105
112
  ast (~> 2.4.1)
@@ -163,10 +170,19 @@ GEM
163
170
  faraday (>= 0.17.3, < 3)
164
171
  sqlite3 (1.6.2)
165
172
  mini_portile2 (~> 2.8.0)
173
+ sqlite3 (1.6.2-arm64-darwin)
166
174
  sqlite3 (1.6.2-x86_64-darwin)
167
175
  sqlite3 (1.6.2-x86_64-linux)
176
+ strings-ansi (0.2.0)
168
177
  timers (4.3.5)
169
178
  traces (0.9.1)
179
+ tty-cursor (0.7.1)
180
+ tty-progressbar (0.18.2)
181
+ strings-ansi (~> 0.2)
182
+ tty-cursor (~> 0.7)
183
+ tty-screen (~> 0.8)
184
+ unicode-display_width (>= 1.6, < 3.0)
185
+ tty-screen (0.8.1)
170
186
  tzinfo (2.0.6)
171
187
  concurrent-ruby (~> 1.0)
172
188
  unf (0.1.4)
@@ -181,6 +197,7 @@ GEM
181
197
  hashdiff (>= 0.4.0, < 2.0.0)
182
198
 
183
199
  PLATFORMS
200
+ arm64-darwin-22
184
201
  universal-java-11
185
202
  x86_64-darwin-21
186
203
  x86_64-darwin-22
@@ -194,6 +211,7 @@ DEPENDENCIES
194
211
  dotenv (~> 2.8)
195
212
  faraday-retry (~> 2.0)
196
213
  github_changelog_generator (~> 1.16)
214
+ hnswlib (~> 0.8.1)
197
215
  rake (~> 13.0)
198
216
  rest-client (~> 2.1)
199
217
  rspec (~> 3.2)
data/README.md CHANGED
@@ -2,7 +2,6 @@
2
2
 
3
3
  <h4 align="center">
4
4
  <a href="https://www.boxcars.ai">Website</a> |
5
- <a href="https://www.boxcars.ai/roadmap">Roadmap</a> |
6
5
  <a href="https://www.boxcars.ai/blog">Blog</a> |
7
6
  <a href="https://github.com/BoxcarsAI/boxcars/wiki">Documentation</a>
8
7
  </h4>
data/boxcars.gemspec CHANGED
@@ -37,6 +37,7 @@ Gem::Specification.new do |spec|
37
37
 
38
38
  # runtime dependencies
39
39
  spec.add_dependency "google_search_results", "~> 2.2"
40
+ spec.add_dependency "gpt4all", "~> 0.0.4"
40
41
  spec.add_dependency "ruby-openai", "~> 3.0"
41
42
 
42
43
  # For more information and examples about making a new gem, checkout our
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
module Boxcars
  module Embeddings
    # A piece of text together with free-form metadata describing it.
    class Document
      attr_accessor :page_content, :metadata

      # @param fields [Hash] optional initial values.
      # @option fields [String] :page_content the document text. Defaults to an empty String.
      # @option fields [Hash] :metadata data attached to the document. Defaults to an empty Hash.
      def initialize(fields = {})
        # `+""` gives an unfrozen String even under `frozen_string_literal: true`,
        # so the default content can be appended to through the accessor.
        @page_content = fields[:page_content] || +""
        @metadata = fields[:metadata] || {}
      end
    end
  end
end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'openai'
4
+
5
module Boxcars
  module Embeddings
    # Converts a list of texts into embedding vectors through the OpenAI embeddings endpoint.
    class EmbedViaOpenAI
      include Embeddings

      attr_accessor :texts, :openai_connection, :model

      # @param texts [Array<String>] The texts to embed.
      # @param openai_connection [OpenAI::Client] The client used to request the embeddings.
      # @param model [String] The embedding model name. Defaults to 'text-embedding-ada-002'.
      # @raise [Boxcars::ValueError] if texts or openai_connection have the wrong type.
      def initialize(texts:, openai_connection:, model: 'text-embedding-ada-002')
        validate_params(texts, openai_connection)
        @texts = texts
        @openai_connection = openai_connection
        @model = model
      end

      # Requests one embedding per text, in order.
      # @return [Array<Hash>] One `{ embedding: Array, dim: Integer }` hash per text.
      # @raise [Boxcars::Error] if a response does not contain an embedding.
      def call
        texts.map do |text|
          embedding = embedding_with_retry(model: model, input: strip_new_lines(text))
          { embedding: embedding, dim: embedding.size }
        end
      end

      private

      def validate_params(texts, openai_connection)
        raise_error 'texts must be an array of strings' unless texts.is_a?(Array) && texts.all? { |text| text.is_a?(String) }
        raise_error 'openai_connection must be an OpenAI::Client' unless openai_connection.is_a?(OpenAI::Client)
      end

      # Sends a single embeddings request (despite the name, no retry is performed)
      # and returns the first embedding of the response.
      def embedding_with_retry(request)
        response = @openai_connection.embeddings(parameters: request)
        first_item = Array(response['data']).first
        embedding = first_item && first_item['embedding']
        # An error payload carries no 'data'; surface it instead of failing on nil.
        unless embedding
          raise ::Boxcars::Error, "OpenAI embeddings request failed: #{response['error'] || 'no embedding in response'}"
        end

        embedding
      end

      # Newlines are replaced with spaces before the text is sent for embedding.
      def strip_new_lines(text)
        text.tr("\n", ' ')
      end

      def raise_error(message)
        raise ::Boxcars::ValueError, message
      end
    end
  end
end
@@ -0,0 +1,159 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'fileutils'
4
+ require 'hnswlib'
5
+ require 'json'
6
+
7
module Boxcars
  module Embeddings
    module Hnswlib
      # Builds the vector store used for hnswlib similarity search.
      #
      # It loads the training files, splits them into chunks, generates embeddings for the
      # chunks, saves the vector store, and finally loads the saved store back into memory.
      # The split documents are also saved, with their index numbers, to a json file.
      class BuildVectorStore
        include Embeddings

        # @param training_data_path [String] The path to the training data. Can be a glob pattern.
        # @param index_file_path [String] The path to the index file.
        # @param split_chunk_size [Integer] The chunk size passed to SplitText for each file. default 2000
        # @param json_doc_file_path [String, nil] The json file containing the document text.
        #   If nil, the index file path is reused with a trailing ".bin" replaced by ".json".
        # @param force_rebuild [Boolean] If true (the default), rebuild the index even if it already exists.
        def initialize(
          training_data_path:,
          index_file_path:,
          split_chunk_size: 2000,
          json_doc_file_path: nil,
          force_rebuild: true
        )
          @training_data_path = training_data_path
          @index_file_path = index_file_path
          @split_chunk_size = split_chunk_size
          @json_doc_file_path = json_doc_file_path || index_file_path.gsub(/\.bin$/, '.json')
          @force_rebuild = force_rebuild
        end

        # Runs the whole pipeline.
        # @return [Hash] `{ vector_store: <loaded index>, document_embeddings: <parsed json documents> }`
        # @raise [Boxcars::Error] when a parameter is invalid or a json file cannot be parsed.
        def call
          validate_params
          data = load_files
          documents = split_text_into_chunks(data)
          embeddings_with_config = generate_embeddings(documents)
          save_vector_store(embeddings_with_config)
          load_hnsw
        end

        private

        attr_reader :training_data_path, :index_file_path, :split_chunk_size, :json_doc_file_path, :force_rebuild

        def validate_params
          # Strip glob stars so the directory part of a pattern can be checked.
          training_data_dir = File.dirname(training_data_path.gsub(/\*{1,2}/, ''))
          raise_error('training_data_path parent directory must exist') unless File.directory?(training_data_dir)
          raise_error('No files found at the training_data_path pattern') if Dir.glob(training_data_path).empty?

          index_dir = File.dirname(index_file_path)
          raise_error('index_file_path parent directory must exist') unless File.directory?(index_dir)

          raise_error('split_chunk_size must be an integer') unless split_chunk_size.is_a?(Integer)
        end

        # Reads every file matched by training_data_path.
        # @return [Array<String>] the file contents.
        def load_files
          files = Dir.glob(training_data_path)
          raise_error "No files found at #{training_data_path}" if files.empty?

          data = files.map { |file| File.read(file) }
          puts "Added #{files.length} files to data. Splitting text into chunks..."
          data
        end

        # Splits each file's text on newlines into chunks; skipped (returns true) when no rebuild is needed.
        def split_text_into_chunks(data)
          return true unless rebuild_required?

          data.flat_map do |chunk|
            Boxcars::Embeddings::SplitText.call(
              separator: "\n", chunk_size: split_chunk_size, chunk_overlap: 0, text: chunk
            )
          end
        end

        # The config file lives next to the index file.
        def hnswlib_config_path
          "#{File.dirname(index_file_path)}/hnswlib_config.json"
        end

        # A rebuild is needed when forced, or when the index or its config file is missing.
        def rebuild_required?
          return true if force_rebuild

          !(File.exist?(index_file_path) && File.exist?(hnswlib_config_path))
        end

        # Embeds the documents through OpenAI; skipped (returns true) when no rebuild is needed.
        # @return [Hash] `{ document_embeddings: Array<Hash>, dim: Integer }`
        def generate_embeddings(documents)
          return true unless rebuild_required?

          # Without documents there is no embedding to read the dimension from.
          raise_error 'No documents to embed' if documents.empty?

          puts "Initializing Store..."
          openai_client = OpenAI::Client.new(access_token: ENV.fetch('OPENAI_API_KEY', nil))

          embeddings_with_dim = Boxcars::Embeddings::EmbedViaOpenAI.call(texts: documents, openai_connection: openai_client)

          document_embeddings = embeddings_with_dim.map.with_index do |item, index|
            { doc_id: index, embedding: item[:embedding], document: documents[index] }
          end

          { document_embeddings: document_embeddings, dim: embeddings_with_dim.first[:dim] }
        end

        def save_vector_store(embeddings_with_config)
          return true unless rebuild_required?

          puts "Saving Vectorstore"
          Boxcars::Embeddings::Hnswlib::SaveToHnswlib.call(
            document_embeddings: embeddings_with_config[:document_embeddings],
            index_file_path: index_file_path,
            json_doc_file_path: json_doc_file_path,
            hnswlib_config: hnswlib_config(embeddings_with_config[:dim])
          )
          puts "VectorStore saved"
        end

        def hnswlib_config(dim)
          # dim: length of datum point vector that will be indexed.
          Boxcars::Embeddings::Hnswlib::HnswlibConfig.new(
            metric: "l2", max_item: 10000, dim: dim
          )
        end

        # Loads the saved index and the saved documents from disk.
        def load_hnsw
          puts "Loading Hnswlib"

          json_config = parse_json_file(hnswlib_config_path)
          document_embeddings = parse_json_file(json_doc_file_path)

          search_index = ::Hnswlib::HierarchicalNSW.new(space: json_config[:metric], dim: json_config[:dim])
          search_index.load_index(index_file_path)

          { vector_store: search_index, document_embeddings: document_embeddings }
        end

        # Parses a json file with symbolized keys; a nil path yields an empty array.
        def parse_json_file(file_path)
          return [] if file_path.nil?

          file_content = File.read(file_path)
          JSON.parse(file_content, symbolize_names: true)
        rescue JSON::ParserError => e
          # Name the file that actually failed; this method also parses the documents file.
          raise_error("Error parsing #{file_path}: #{e.message}")
        end

        def raise_error(message)
          raise ::Boxcars::Error, message
        end
      end
    end
  end
end
@@ -0,0 +1,56 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
module Boxcars
  module Embeddings
    module Hnswlib
      # Settings for an hnswlib search index, serializable to json.
      class HnswlibConfig
        attr_reader :metric, :max_item, :dim, :ef_construction, :max_outgoing_connection

        # `m` used to be declared as a reader but was never assigned, so it always returned nil.
        # It is kept as an alias of max_outgoing_connection so existing callers keep working.
        alias m max_outgoing_connection

        # used for search index.
        #
        # @param metric [String] The distance metric between vectors ('l2', 'dot', or 'cosine').
        #
        # @param max_item [Integer] The maximum number of items.
        #
        # @param dim [Integer] The length of the vectors that will be indexed.
        #
        # @param ef_construction [Integer] The size of the dynamic list for the nearest neighbors.
        #   It controls the index time/accuracy trade-off.
        #
        # @param max_outgoing_connection [Integer] The maximum number of outgoing connections in the graph
        #
        # reference: https://yoshoku.github.io/hnswlib.rb/doc/
        def initialize(
          metric: "l2",
          max_item: 10000,
          dim: 2,
          ef_construction: 200,
          max_outgoing_connection: 16
        )
          @metric = metric
          @max_item = max_item
          @dim = dim
          @ef_construction = ef_construction
          @max_outgoing_connection = max_outgoing_connection
        end

        # @return [String] 'ip' when the metric is 'dot'; 'l2' for every other metric.
        def space
          @metric == 'dot' ? 'ip' : 'l2'
        end

        # @return [String] the settings as pretty-printed json.
        def to_json(*args)
          JSON.pretty_generate(
            {
              metric: @metric,
              max_item: @max_item,
              dim: @dim,
              ef_construction: @ef_construction,
              max_outgoing_connection: @max_outgoing_connection
            },
            *args
          )
        end
      end
    end
  end
end
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'hnswlib'
4
+ require 'json'
5
+
6
module Boxcars
  module Embeddings
    module Hnswlib
      # Looks up the documents nearest to a query vector in an hnswlib index.
      class HnswlibSearch
        # @param vector_store [::Hnswlib::HierarchicalNSW] The index to search.
        # @param options [Hash] Search options.
        # @option options [String] :json_doc_path Path of the json file holding the indexed documents.
        # @option options [Integer] :num_neighbors How many neighbors to request. Defaults to 1.
        # @raise [ArgumentError] if vector_store is not an Hnswlib::HierarchicalNSW.
        def initialize(vector_store:, options: {})
          validate_params(vector_store)
          @vector_store = vector_store
          @json_doc_path = options[:json_doc_path]
          @num_neighbors = options[:num_neighbors] || 1
        end

        # @param query [Array<Numeric>] The query vector handed to the index.
        # @return [Array<Hash>] `{ document:, distance: }` for every result whose doc_id is known.
        def call(query)
          search(query)
        end

        private

        attr_reader :json_doc_path, :vector_store, :num_neighbors

        def validate_params(vector_store)
          raise_error 'vector_store must be an Hnswlib::HierarchicalNSW' unless vector_store.is_a?(::Hnswlib::HierarchicalNSW)
        end

        def search(query)
          # NOTE(review): this treats each search_knn result as a [doc_id, distance] pair --
          # confirm against the return shape documented by hnswlib.
          raw_results = vector_store.search_knn(query, num_neighbors)
          raw_results.map { |doc_id, distance| lookup_embedding(doc_id, distance) }.compact
        end

        # Returns nil when the doc_id is not present in the json documents.
        def lookup_embedding(doc_id, distance)
          embedding_data = documents_by_id[doc_id]
          return unless embedding_data

          { document: embedding_data[:document], distance: distance }
        end

        # Index of the json documents by doc_id, built once so each lookup is a hash access
        # instead of a scan over every document. The first entry wins for duplicate ids.
        def documents_by_id
          @documents_by_id ||= parsed_data.each_with_object({}) do |entry, by_id|
            by_id[entry[:doc_id]] ||= entry
          end
        end

        def parsed_data
          @parsed_data ||= JSON.parse(
            File.read(json_doc_path),
            symbolize_names: true
          )
        end

        def raise_error(message)
          raise ArgumentError, message
        end
      end
    end
  end
end
@@ -0,0 +1,80 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'hnswlib'
4
+ require 'json'
5
+ require 'fileutils'
6
+
7
module Boxcars
  module Embeddings
    module Hnswlib
      # Adds document embeddings to an hnswlib index and writes the index, its config,
      # and the documents to disk.
      class SaveToHnswlib
        include Embeddings

        # @param document_embeddings [Array] An array of hashes containing the document id, document text, and embedding.
        # @param index_file_path [String] The path to the index file.
        # @param hnswlib_config [Boxcars::Embeddings::Hnswlib::HnswlibConfig] The config object for the hnswlib index.
        # @param json_doc_file_path [String, nil] Optional. The path to the json file containing the document text.
        #   Defaults to the index file path with a trailing ".bin" replaced by ".json".
        # @raise [Boxcars::ValueError] if a parameter is invalid or a parent directory is missing.
        def initialize(document_embeddings:, index_file_path:, hnswlib_config:, json_doc_file_path: nil)
          @document_embeddings = document_embeddings
          @index_file_path = index_file_path
          @hnswlib_config = hnswlib_config
          # Validate before the values are used below: the json path is derived from
          # index_file_path and the index is sized from hnswlib_config.dim.
          validate_params

          @json_doc_file_path = json_doc_file_path || index_file_path.gsub(/\.bin$/, '.json')
          [@index_file_path, @json_doc_file_path].each { |path| check_parent_directory(path) }

          @index = ::Hnswlib::HnswIndex.new(
            n_features: hnswlib_config.dim,
            max_item: hnswlib_config.max_item,
            metric: hnswlib_config.metric
          )
        end

        # Adds every embedding to the index, then writes the documents json, the config json,
        # and the index file.
        def call
          document_texts = document_embeddings.map do |embedding|
            index.add_item(embedding[:doc_id], embedding[:embedding])

            { doc_id: embedding[:doc_id], embedding: embedding[:embedding], document: embedding[:document] }
          end

          write_files(index, document_texts)
        end

        private

        attr_reader :index, :document_embeddings, :index_file_path, :json_doc_file_path, :hnswlib_config

        def write_files(index, document_texts)
          FileUtils.mkdir_p(File.dirname(json_doc_file_path))
          File.write(json_doc_file_path, document_texts.to_json)

          FileUtils.mkdir_p(File.dirname(index_file_path))
          # The config is stored next to the index file.
          File.write("#{File.dirname(index_file_path)}/hnswlib_config.json", hnswlib_config.to_json)

          index.save(index_file_path)
        end

        def validate_params
          raise_error("document_embeddings must be an array") unless document_embeddings.is_a?(Array)
          raise_error("dim must be an integer") unless hnswlib_config.dim.is_a?(Integer)
          raise_error("index_file_path must be a string") unless index_file_path.is_a?(String)
        end

        def check_parent_directory(path)
          return unless path

          parent_dir = File.dirname(path)
          raise_error('parent directory must exist') unless File.directory?(parent_dir)
        end

        def raise_error(message)
          raise ::Boxcars::ValueError, message
        end
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'hnswlib'
4
+
5
module Boxcars
  module Embeddings
    # Embeds a text query and searches a vector store for the nearest documents.
    class SimilaritySearch
      # @param embeddings [String] Passed to the hnswlib search as the json document path.
      # @param vector_store [::Hnswlib::HierarchicalNSW] The index to search.
      # @param openai_connection [OpenAI::Client] The client used to embed the query.
      # @param num_neighbors [Integer] How many neighbors to request per query. Defaults to 2.
      # @raise [ArgumentError] if the vector store type is not supported.
      def initialize(embeddings:, vector_store:, openai_connection:, num_neighbors: 2)
        @embeddings = embeddings
        @vector_store = vector_store
        @openai_connection = openai_connection
        @num_neighbors = num_neighbors
        @similarity_search_instance = create_similarity_search_instance
      end

      # @param query [String] The non-empty text to search for.
      # @return [Object] Whatever the underlying search returns for the query vector.
      # @raise [ArgumentError] if query is not a non-empty String.
      def call(query:)
        validate_query(query)
        query_vector = convert_query_to_vector(query)
        @similarity_search_instance.call(query_vector)
      end

      private

      attr_reader :embeddings, :vector_store, :openai_connection, :num_neighbors

      def validate_query(query)
        raise_error 'query must be a string' unless query.is_a?(String)
        raise_error 'query must not be empty' if query.empty?
      end

      # Embeds the query on its own and returns just its vector.
      def convert_query_to_vector(query)
        Boxcars::Embeddings::EmbedViaOpenAI.call(texts: [query], openai_connection: openai_connection).first[:embedding]
      end

      # Picks the search implementation that matches the vector store's class.
      def create_similarity_search_instance
        case vector_store
        when ::Hnswlib::HierarchicalNSW
          Boxcars::Embeddings::Hnswlib::HnswlibSearch.new(
            vector_store: vector_store,
            options: { json_doc_path: embeddings, num_neighbors: num_neighbors }
          )
        else
          raise_error 'Unsupported vector store provided'
        end
      end

      def raise_error(message)
        raise ArgumentError, message
      end
    end
  end
end
@@ -0,0 +1,104 @@
1
+ # frozen_string_literal: true
2
+
3
module Boxcars
  module Embeddings
    # Split a text into chunks of a given size.
    #
    # The text is cut on the separator and neighbouring pieces are merged back together
    # until a chunk would reach chunk_size. Piece lengths are counted without the separator.
    class SplitText
      include Embeddings

      attr_reader :separator, :chunk_size, :chunk_overlap, :text

      # @param separator [String] The string to use to split the text.
      # @param chunk_size [Integer] The size of each chunk.
      # @param chunk_overlap [Integer] The amount of overlap between chunks. Must be >= 0 and < chunk_size.
      # @param text [String] The text to split.
      # @raise [Boxcars::ValueError] if a parameter has the wrong type or range.
      def initialize(separator: "Search", chunk_size: 7, chunk_overlap: 3, text: "")
        validate_params(separator, chunk_size, chunk_overlap, text)

        @separator = separator
        @chunk_size = chunk_size
        @chunk_overlap = chunk_overlap
        @text = text
      end

      # @return [Array<String>] the merged chunks, sorted lexicographically (not in document order).
      def call
        splits = text.split(separator)
        merge_splits(splits, separator).sort
      end

      private

      def validate_params(separator, chunk_size, chunk_overlap, text)
        raise_error("separator must be a string") unless separator.is_a?(String)
        raise_error("chunk_size must be an integer") unless chunk_size.is_a?(Integer)
        raise_error("chunk_overlap must be an integer") unless chunk_overlap.is_a?(Integer)
        raise_error("text must be a string") unless text.is_a?(String)
        # A negative overlap would keep remove_overlap looping after the current chunk is empty.
        raise_error("chunk_overlap must not be negative") if chunk_overlap.negative?
        raise_error("chunk_overlap must be less than chunk_size") if chunk_overlap >= chunk_size
      end

      def raise_error(message)
        raise ::Boxcars::ValueError, message
      end

      # Walks the pieces in order, flushing the current chunk whenever the next piece
      # would make it reach chunk_size. `total` is the summed length of `current_doc`.
      def merge_splits(splits, separator)
        merged_splits = []
        current_doc = []
        total = 0

        splits.each do |split|
          split_len = split.length
          total = process_split(total, split_len, current_doc, merged_splits, separator)
          current_doc << split
          total += split_len
        end

        add_remaining_doc(current_doc, merged_splits, separator)
        merged_splits
      end

      # Flushes the current chunk if adding a piece of split_len would reach chunk_size.
      # @return [Integer] the length still held in current_doc afterwards.
      def process_split(total, split_len, current_doc, merged_splits, separator)
        return total if total + split_len < chunk_size

        warn_if_chunk_too_large(total)
        handle_large_chunk(total, split_len, current_doc, merged_splits, separator)
      end

      def warn_if_chunk_too_large(total)
        return unless total > chunk_size

        puts "Created a chunk of size #{total}, which is longer than the specified #{chunk_size}"
      end

      # Emits the current chunk (if any) and trims it down to the allowed overlap.
      def handle_large_chunk(total, split_len, current_doc, merged_splits, separator)
        return total if current_doc.empty?

        doc = join_docs(current_doc, separator)
        merged_splits << doc if doc
        remove_overlap(total, split_len, current_doc)
      end

      # Drops leading pieces until at most chunk_overlap characters remain and the next
      # piece fits, mutating current_doc in place.
      def remove_overlap(total, split_len, current_doc)
        while total > chunk_overlap || (total + split_len > chunk_size && total.positive?)
          total -= current_doc[0].length
          current_doc.shift
        end
        total
      end

      def add_remaining_doc(current_doc, merged_splits, separator)
        doc = join_docs(current_doc, separator)
        merged_splits << doc if doc
      end

      # @return [String, nil] the joined, stripped chunk, or nil when it is empty.
      def join_docs(docs, separator)
        text = docs.join(separator).strip
        text.empty? ? nil : text
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Boxcars is a framework for running a series of tools to get an answer to a question.
4
module Boxcars
  # Mixin that turns a class into a callable service object: including it adds a
  # class-level `call` and makes `new` private.
  module Embeddings
    # Class-level methods added to every class that includes Embeddings.
    module ClassMethods
      EmbeddingsError = Class.new(StandardError)

      # Builds an instance with the given arguments and runs its `call` method.
      # @return [Object] whatever the instance's `call` returns.
      def call(*args, **kw_args)
        instance = new(*args, **kw_args)
        instance.call
      end
    end

    # Hook run when the module is included: adds ClassMethods to the including class
    # and hides `new`, so instances are only created through `call`.
    def self.included(base)
      base.extend(ClassMethods)
      base.private_class_method :new
    end
  end
end
23
+
24
+ require_relative "embeddings/document"
25
+ require_relative "embeddings/embed_via_open_ai"
26
+ require_relative "embeddings/split_text"
27
+ require_relative "embeddings/similarity_search"
28
+ require_relative "embeddings/hnswlib/hnswlib_config"
29
+ require_relative "embeddings/hnswlib/save_to_hnswlib"
30
+ require_relative "embeddings/hnswlib/build_vector_store"
31
+ require_relative "embeddings/hnswlib/hnswlib_search"
@@ -57,7 +57,15 @@ module Boxcars
57
57
  raise Error, "Got error from SerpAPI: {res[:error]}" if res[:error]
58
58
 
59
59
  ANSWER_LOCATIONS.each do |path|
60
- return res.dig(*path) if res.dig(*path)
60
+ next unless res.dig(*path)
61
+
62
+ Boxcars.debug("Found SERP answer at #{path}", :cyan)
63
+ path_link = path.dup
64
+ last_word = path_link.pop
65
+ path_link << :link
66
+ return { last_word => res.dig(*path), url: res.dig(*path_link) } if last_word.is_a?(Symbol) && res.dig(*path_link)
67
+
68
+ return res.dig(*path)
61
69
  end
62
70
  "No good search result found"
63
71
  end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Boxcars is a framework for running a series of tools to get an answer to a question.
4
+ module Boxcars
5
+ # Boxcar subclass for embeddings; at present it only defines its own Error class.
6
+ class Embedding < Boxcar
7
+ Error = Class.new(StandardError)
8
+ end
9
+ end
10
+
11
+ require "boxcars/boxcar/embeddings"
@@ -0,0 +1,56 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'gpt4all'
4
+ # Boxcars is a framework for running a series of tools to get an answer to a question.
5
module Boxcars
  # An engine that answers prompts through the local GPT4All API.
  class Gpt4allEng < Engine
    # NOTE(review): model_kwargs is exposed as a reader but never assigned in this class.
    attr_reader :prompts, :model_kwargs, :batch_size

    # the default name of the engine
    DEFAULT_NAME = "Gpt4all engine"
    # the default description of the engine
    DEFAULT_DESCRIPTION = "useful for when you need to use local AI to answer questions. " \
                          "You should ask targeted questions"

    # An engine is a container for a single tool to run.
    # @param name [String] The name of the engine. Defaults to "Gpt4all engine".
    # @param description [String] A description of the engine. Defaults to DEFAULT_DESCRIPTION.
    # @param prompts [Array<String>] The prompts to use when asking the engine. Defaults to [].
    # @param batch_size [Integer] The number of prompts to send to the engine at once. Defaults to 2.
    def initialize(name: DEFAULT_NAME, description: DEFAULT_DESCRIPTION, prompts: [], batch_size: 2, **_kwargs)
      @prompts = prompts
      @batch_size = batch_size
      super(description: description, name: name)
    end

    # Get an answer from the engine.
    # @param prompt [Prompt] The prompt to use when asking the engine; it is formatted with `inputs`.
    # @param inputs [Hash] The values substituted into the prompt. Defaults to {}.
    # @param _kwargs [Hash] Additional parameters; they are accepted and ignored.
    # @return [Object] The engine's reply, or the result of Boxcars.error when the request fails.
    def client(prompt:, inputs: {}, **_kwargs)
      gpt4all = Gpt4all::ConversationalAI.new
      gpt4all.prepare_resources(force_download: false)
      gpt4all.start_bot
      input_text = prompt.as_prompt(inputs)[:prompt]
      Boxcars.debug("Prompt after formatting:\n#{input_text}", :cyan) if Boxcars.configuration.log_prompts
      gpt4all.prompt(input_text)
    rescue StandardError => e
      # `last(5)` keeps short backtraces too; `[-5..-1]` is nil when there are fewer than five frames.
      Boxcars.error(["Error from gpt4all engine: #{e}", Array(e.backtrace).last(5)].flatten.join("\n "))
    ensure
      # gpt4all is nil when the constructor itself raised; do not fail again while cleaning up.
      gpt4all&.stop_bot
    end

    # get an answer from the engine for a question.
    # @param question [String] The question to ask the engine.
    # @param kwargs [Hash] Additional parameters to pass to the engine if wanted.
    # @return [Object] The answer returned by #client.
    def run(question, **kwargs)
      prompt = Prompt.new(template: question)
      answer = client(prompt: prompt, **kwargs)
      Boxcars.debug("Answer: #{answer}", :cyan)
      answer
    end
  end
end
@@ -21,3 +21,4 @@ end
21
21
 
22
22
  require "boxcars/engine/engine_result"
23
23
  require "boxcars/engine/openai"
24
+ require "boxcars/engine/gpt4all_eng"
@@ -2,5 +2,5 @@
2
2
 
3
3
  module Boxcars
4
4
  # The current version of the gem.
5
- VERSION = "0.2.7"
5
+ VERSION = "0.2.8"
6
6
  end
data/lib/boxcars.rb CHANGED
@@ -170,3 +170,4 @@ require "boxcars/ruby_repl"
170
170
  require "boxcars/engine"
171
171
  require "boxcars/boxcar"
172
172
  require "boxcars/train"
173
+ require "boxcars/embedding"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: boxcars
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.7
4
+ version: 0.2.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francis Sullivan
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2023-04-13 00:00:00.000000000 Z
12
+ date: 2023-04-19 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: debug
@@ -67,6 +67,20 @@ dependencies:
67
67
  - - "~>"
68
68
  - !ruby/object:Gem::Version
69
69
  version: '2.2'
70
+ - !ruby/object:Gem::Dependency
71
+ name: gpt4all
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - "~>"
75
+ - !ruby/object:Gem::Version
76
+ version: 0.0.4
77
+ type: :runtime
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - "~>"
82
+ - !ruby/object:Gem::Version
83
+ version: 0.0.4
70
84
  - !ruby/object:Gem::Dependency
71
85
  name: ruby-openai
72
86
  requirement: !ruby/object:Gem::Requirement
@@ -106,6 +120,15 @@ files:
106
120
  - lib/boxcars/boxcar.rb
107
121
  - lib/boxcars/boxcar/active_record.rb
108
122
  - lib/boxcars/boxcar/calculator.rb
123
+ - lib/boxcars/boxcar/embeddings.rb
124
+ - lib/boxcars/boxcar/embeddings/document.rb
125
+ - lib/boxcars/boxcar/embeddings/embed_via_open_ai.rb
126
+ - lib/boxcars/boxcar/embeddings/hnswlib/build_vector_store.rb
127
+ - lib/boxcars/boxcar/embeddings/hnswlib/hnswlib_config.rb
128
+ - lib/boxcars/boxcar/embeddings/hnswlib/hnswlib_search.rb
129
+ - lib/boxcars/boxcar/embeddings/hnswlib/save_to_hnswlib.rb
130
+ - lib/boxcars/boxcar/embeddings/similarity_search.rb
131
+ - lib/boxcars/boxcar/embeddings/split_text.rb
109
132
  - lib/boxcars/boxcar/engine_boxcar.rb
110
133
  - lib/boxcars/boxcar/google_search.rb
111
134
  - lib/boxcars/boxcar/sql.rb
@@ -113,8 +136,10 @@ files:
113
136
  - lib/boxcars/boxcar/wikipedia_search.rb
114
137
  - lib/boxcars/conversation.rb
115
138
  - lib/boxcars/conversation_prompt.rb
139
+ - lib/boxcars/embedding.rb
116
140
  - lib/boxcars/engine.rb
117
141
  - lib/boxcars/engine/engine_result.rb
142
+ - lib/boxcars/engine/gpt4all_eng.rb
118
143
  - lib/boxcars/engine/openai.rb
119
144
  - lib/boxcars/generation.rb
120
145
  - lib/boxcars/prompt.rb