boxcars 0.2.7 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c58233487372c9e99f160786cc18f8e5464262479110ae2fcf2048fc09f3b338
4
- data.tar.gz: 237d72649f3faef07c493e93f514077afd144578782fc8861f5a0b0b40152828
3
+ metadata.gz: 69b70e1d02b1ec206438eaaf857a0495fe35ab01e64a265656fe21230675306f
4
+ data.tar.gz: 8681b9625a0684f1091eea7a4626964929b271370068c90b82dabeee4253d803
5
5
  SHA512:
6
- metadata.gz: a3c148dc072fea12862849183bfc37f4189639982e8a7409afab267aa22baed5b911dff223562705be42c5d57e06e76e6e8814b5366c120444fca529fe204187
7
- data.tar.gz: acec67f2d4b5be3b660c2c6e8d90ef772925903f053f0a62965fc61f335a4fc84bbeee47265c49a596945e846d31ecad3b5a93602521037f91ed689c402712d8
6
+ metadata.gz: eb5c0c00f8fcdbbd6d8a1999d7544fc584c701fdf9a8a9c271fff6d9795f75ef9cab058fee2c6829808a764c892cc3e2f4e4a8717155d34d6514b46d744e632c
7
+ data.tar.gz: b8fb4ad34d7b93d47388f037d1d93e9e7245303740bc04d58d21942112ff97315e5dd31fdba77e275b52ddba85ca1055b897646ae1e606daee485583d50c52a6
data/CHANGELOG.md CHANGED
@@ -2,7 +2,22 @@
2
2
 
3
3
  ## [Unreleased](https://github.com/BoxcarsAI/boxcars/tree/HEAD)
4
4
 
5
- [Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.5...HEAD)
5
+ [Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.7...HEAD)
6
+
7
+ **Closed issues:**
8
+
9
+ - Getting the same verbosity as in the examples [\#54](https://github.com/BoxcarsAI/boxcars/issues/54)
10
+
11
+ **Merged pull requests:**
12
+
13
+ - Add Engine for Gpt4all [\#55](https://github.com/BoxcarsAI/boxcars/pull/55) ([francis](https://github.com/francis))
14
+ - update google search to return URL for result if present [\#53](https://github.com/BoxcarsAI/boxcars/pull/53) ([francis](https://github.com/francis))
15
+ - Draft: added gpt4all [\#49](https://github.com/BoxcarsAI/boxcars/pull/49) ([jaigouk](https://github.com/jaigouk))
16
+ - Embeddings with hnswlib [\#48](https://github.com/BoxcarsAI/boxcars/pull/48) ([jaigouk](https://github.com/jaigouk))
17
+
18
+ ## [v0.2.7](https://github.com/BoxcarsAI/boxcars/tree/v0.2.7) (2023-04-13)
19
+
20
+ [Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.5...v0.2.7)
6
21
 
7
22
  **Closed issues:**
8
23
 
data/Gemfile CHANGED
@@ -30,3 +30,5 @@ gem "faraday-retry", "~> 2.0"
30
30
  gem "activesupport", "~> 7.0"
31
31
 
32
32
  gem "rest-client", "~> 2.1"
33
+
34
+ gem "hnswlib", "~> 0.8.1"
data/Gemfile.lock CHANGED
@@ -1,8 +1,9 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- boxcars (0.2.7)
4
+ boxcars (0.2.8)
5
5
  google_search_results (~> 2.2)
6
+ gpt4all (~> 0.0.4)
6
7
  ruby-openai (~> 3.0)
7
8
 
8
9
  GEM
@@ -71,7 +72,12 @@ GEM
71
72
  rainbow (>= 2.2.1)
72
73
  rake (>= 10.0)
73
74
  google_search_results (2.2.0)
75
+ gpt4all (0.0.5)
76
+ faraday (~> 2.7)
77
+ os (~> 1.1)
78
+ tty-progressbar (~> 0.18.2)
74
79
  hashdiff (1.0.1)
80
+ hnswlib (0.8.1)
75
81
  http-accept (1.7.0)
76
82
  http-cookie (1.0.5)
77
83
  domain_name (~> 0.5)
@@ -100,6 +106,7 @@ GEM
100
106
  octokit (4.25.1)
101
107
  faraday (>= 1, < 3)
102
108
  sawyer (~> 0.9)
109
+ os (1.1.4)
103
110
  parallel (1.22.1)
104
111
  parser (3.2.1.1)
105
112
  ast (~> 2.4.1)
@@ -163,10 +170,19 @@ GEM
163
170
  faraday (>= 0.17.3, < 3)
164
171
  sqlite3 (1.6.2)
165
172
  mini_portile2 (~> 2.8.0)
173
+ sqlite3 (1.6.2-arm64-darwin)
166
174
  sqlite3 (1.6.2-x86_64-darwin)
167
175
  sqlite3 (1.6.2-x86_64-linux)
176
+ strings-ansi (0.2.0)
168
177
  timers (4.3.5)
169
178
  traces (0.9.1)
179
+ tty-cursor (0.7.1)
180
+ tty-progressbar (0.18.2)
181
+ strings-ansi (~> 0.2)
182
+ tty-cursor (~> 0.7)
183
+ tty-screen (~> 0.8)
184
+ unicode-display_width (>= 1.6, < 3.0)
185
+ tty-screen (0.8.1)
170
186
  tzinfo (2.0.6)
171
187
  concurrent-ruby (~> 1.0)
172
188
  unf (0.1.4)
@@ -181,6 +197,7 @@ GEM
181
197
  hashdiff (>= 0.4.0, < 2.0.0)
182
198
 
183
199
  PLATFORMS
200
+ arm64-darwin-22
184
201
  universal-java-11
185
202
  x86_64-darwin-21
186
203
  x86_64-darwin-22
@@ -194,6 +211,7 @@ DEPENDENCIES
194
211
  dotenv (~> 2.8)
195
212
  faraday-retry (~> 2.0)
196
213
  github_changelog_generator (~> 1.16)
214
+ hnswlib (~> 0.8.1)
197
215
  rake (~> 13.0)
198
216
  rest-client (~> 2.1)
199
217
  rspec (~> 3.2)
data/README.md CHANGED
@@ -2,7 +2,6 @@
2
2
 
3
3
  <h4 align="center">
4
4
  <a href="https://www.boxcars.ai">Website</a> |
5
- <a href="https://www.boxcars.ai/roadmap">Roadmap</a> |
6
5
  <a href="https://www.boxcars.ai/blog">Blog</a> |
7
6
  <a href="https://github.com/BoxcarsAI/boxcars/wiki">Documentation</a>
8
7
  </h4>
data/boxcars.gemspec CHANGED
@@ -37,6 +37,7 @@ Gem::Specification.new do |spec|
37
37
 
38
38
  # runtime dependencies
39
39
  spec.add_dependency "google_search_results", "~> 2.2"
40
+ spec.add_dependency "gpt4all", "~> 0.0.4"
40
41
  spec.add_dependency "ruby-openai", "~> 3.0"
41
42
 
42
43
  # For more information and examples about making a new gem, checkout our
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
module Boxcars
  module Embeddings
    # Plain holder that pairs a piece of text with a hash of metadata.
    class Document
      attr_accessor :page_content, :metadata

      # @param fields [Hash] optional values; only the :page_content and :metadata keys are read.
      #   A missing (or nil/false) :page_content becomes "" and a missing :metadata becomes {}.
      def initialize(fields = {})
        content = fields[:page_content]
        meta = fields[:metadata]

        @page_content = content || ""
        @metadata = meta || {}
      end
    end
  end
end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'openai'
4
+
5
module Boxcars
  module Embeddings
    # Converts a list of texts into embedding vectors, one OpenAI request per text.
    class EmbedViaOpenAI
      include Embeddings

      attr_accessor :texts, :openai_connection, :model

      # @param texts [Array<String>] The texts to embed.
      # @param openai_connection [OpenAI::Client] The client used to call the embeddings endpoint.
      # @param model [String] The embedding model name.
      # @raise [Boxcars::ValueError] if texts is not an array of strings or the
      #   connection is not an OpenAI::Client.
      def initialize(texts:, openai_connection:, model: 'text-embedding-ada-002')
        validate_params(texts, openai_connection)
        @texts = texts
        @openai_connection = openai_connection
        @model = model
      end

      # Embed every text, in order.
      # @return [Array<Hash>] one hash per text with :embedding (the vector) and :dim (its length).
      # @raise [Boxcars::Error] if a response does not contain an embedding.
      def call
        texts.map do |text|
          embedding = embedding_with_retry(model: model, input: strip_new_lines(text))
          { embedding: embedding, dim: embedding.size }
        end
      end

      private

      def validate_params(texts, openai_connection)
        raise_error 'texts must be an array of strings' unless texts.is_a?(Array) && texts.all? { |text| text.is_a?(String) }
        raise_error 'openai_connection must be an OpenAI::Client' unless openai_connection.is_a?(OpenAI::Client)
      end

      # Sends one embeddings request and returns the vector of the first result.
      # NOTE: despite the name, no retry is performed here.
      def embedding_with_retry(request)
        response = @openai_connection.embeddings(parameters: request)
        embedding = extract_embedding(response)
        return embedding if embedding

        # Fail with a descriptive error instead of a NoMethodError on nil.
        raise ::Boxcars::Error, missing_embedding_message(response)
      end

      # Pulls response['data'][0]['embedding'] out of the response, or nil when any level is absent.
      def extract_embedding(response)
        data = response && response['data']
        first = data.first if data.is_a?(Array)
        first['embedding'] if first.is_a?(Hash)
      end

      # Builds the error text, including the 'error' entry of the response when there is one.
      def missing_embedding_message(response)
        error = response['error'] if response
        return 'OpenAI embeddings response did not contain an embedding' unless error

        detail = error.is_a?(Hash) ? error['message'] : error
        "OpenAI embeddings request failed: #{detail}"
      end

      def strip_new_lines(text)
        text.gsub("\n", ' ')
      end

      def raise_error(message)
        raise ::Boxcars::ValueError, message
      end
    end
  end
end
@@ -0,0 +1,159 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'fileutils'
4
+ require 'hnswlib'
5
+ require 'json'
6
+
7
module Boxcars
  module Embeddings
    module Hnswlib
      # Builds (or reuses) an hnswlib vector store from text files.
      #
      # The pipeline reads every file matched by training_data_path, splits the text into
      # chunks, embeds the chunks through OpenAI, saves the index together with two JSON
      # side files (the documents and hnswlib_config.json), and finally loads the saved
      # index back into memory.
      class BuildVectorStore
        include Embeddings

        # @param training_data_path [String] The path to the training data. Can be a glob pattern.
        # @param index_file_path [String] The path to the index file.
        # @param split_chunk_size [Integer] The chunk size handed to SplitText. Default 2000.
        # @param json_doc_file_path [String, nil] The json file containing the document text.
        #   If nil, the index file name is reused with ".bin" replaced by ".json".
        # @param force_rebuild [Boolean] If true (the default), rebuild the index even if it already exists.
        def initialize(
          training_data_path:,
          index_file_path:,
          split_chunk_size: 2000,
          json_doc_file_path: nil,
          force_rebuild: true
        )
          @training_data_path = training_data_path
          @index_file_path = index_file_path
          @split_chunk_size = split_chunk_size
          @json_doc_file_path = json_doc_file_path || index_file_path.gsub(/\.bin$/, '.json')
          @force_rebuild = force_rebuild
        end

        # Run the whole pipeline.
        # @return [Hash] :vector_store (the loaded index) and :document_embeddings (the parsed documents file).
        # @raise [Boxcars::Error] when the parameters are invalid or a JSON side file cannot be parsed.
        def call
          validate_params
          data = load_files
          documents = split_text_into_chunks(data)
          embeddings_with_config = generate_embeddings(documents)
          save_vector_store(embeddings_with_config)
          load_hnsw
        end

        private

        attr_reader :training_data_path, :index_file_path, :split_chunk_size, :json_doc_file_path, :force_rebuild

        def validate_params
          # Strip glob stars so the directory part of a pattern can be checked on disk.
          training_data_dir = File.dirname(training_data_path.gsub(/\*{1,2}/, ''))
          raise_error('training_data_path parent directory must exist') unless File.directory?(training_data_dir)
          raise_error('No files found at the training_data_path pattern') if Dir.glob(training_data_path).empty?

          index_dir = File.dirname(index_file_path)
          raise_error('index_file_path parent directory must exist') unless File.directory?(index_dir)

          raise_error('split_chunk_size must be an integer') unless split_chunk_size.is_a?(Integer)
        end

        # Read every file matched by training_data_path into an array of strings.
        def load_files
          files = Dir.glob(training_data_path)
          raise_error "No files found at #{training_data_path}" if files.empty?

          data = files.map { |file| File.read(file) }
          puts "Added #{files.length} files to data. Splitting text into chunks..."
          data
        end

        # Split each file's text into chunks. Returns true (a placeholder) when no rebuild is needed.
        def split_text_into_chunks(data)
          return true unless rebuild_required?

          data.each_with_object([]) do |chunk, docs|
            docs.concat(
              Boxcars::Embeddings::SplitText.call(
                separator: "\n", chunk_size: split_chunk_size, chunk_overlap: 0, text: chunk
              )
            )
          end
        end

        # A rebuild is needed when forced, or when the index file or its config file is missing.
        def rebuild_required?
          return true if force_rebuild

          !(File.exist?(index_file_path) && File.exist?(config_file_path))
        end

        # hnswlib_config.json lives next to the index file.
        def config_file_path
          "#{File.dirname(index_file_path)}/hnswlib_config.json"
        end

        # Embed the chunks. Returns true (a placeholder) when no rebuild is needed.
        def generate_embeddings(documents)
          return true unless rebuild_required?

          # Without this guard an empty result fails later with a NoMethodError on nil.
          raise_error('No text chunks were produced from the training data') if documents.empty?

          puts "Initializing Store..."
          openai_client = OpenAI::Client.new(access_token: ENV.fetch('OPENAI_API_KEY', nil))

          embeddings_with_dim = Boxcars::Embeddings::EmbedViaOpenAI.call(texts: documents, openai_connection: openai_client)

          document_embeddings = embeddings_with_dim.map.with_index do |item, index|
            { doc_id: index, embedding: item[:embedding], document: documents[index] }
          end

          { document_embeddings: document_embeddings, dim: embeddings_with_dim.first[:dim] }
        end

        def save_vector_store(embeddings_with_config)
          return true unless rebuild_required?

          puts "Saving Vectorstore"
          Boxcars::Embeddings::Hnswlib::SaveToHnswlib.call(
            document_embeddings: embeddings_with_config[:document_embeddings],
            index_file_path: index_file_path,
            json_doc_file_path: json_doc_file_path,
            hnswlib_config: hnswlib_config(embeddings_with_config[:dim])
          )
          puts "VectorStore saved"
        end

        def hnswlib_config(dim)
          # dim: length of datum point vector that will be indexed.
          Boxcars::Embeddings::Hnswlib::HnswlibConfig.new(
            metric: "l2", max_item: 10000, dim: dim
          )
        end

        # Load the saved index and the documents file back from disk.
        def load_hnsw
          puts "Loading Hnswlib"

          json_config = parse_json_file(config_file_path)
          document_embeddings = parse_json_file(json_doc_file_path)

          # Same metric -> space mapping as HnswlibConfig#space ('dot' => 'ip', anything else => 'l2').
          space = json_config[:metric] == 'dot' ? 'ip' : 'l2'
          search_index = ::Hnswlib::HierarchicalNSW.new(space: space, dim: json_config[:dim])
          search_index.load_index(index_file_path)

          { vector_store: search_index, document_embeddings: document_embeddings }
        end

        # Parse a JSON file with symbolized keys; a nil path yields [].
        def parse_json_file(file_path)
          return [] if file_path.nil?

          file_content = File.read(file_path)
          JSON.parse(file_content, symbolize_names: true)
        rescue JSON::ParserError => e
          # Name the file that actually failed; this method parses both the config and the documents file.
          raise_error("Error parsing #{file_path}: #{e.message}")
        end

        def raise_error(message)
          raise ::Boxcars::Error, message
        end
      end
    end
  end
end
@@ -0,0 +1,56 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
module Boxcars
  module Embeddings
    module Hnswlib
      # Settings used when creating an hnswlib search index.
      class HnswlibConfig
        attr_reader :metric, :max_item, :dim, :ef_construction, :max_outgoing_connection

        # Kept for backward compatibility: `m` used to be declared as a reader but was
        # never backed by a value. It now returns max_outgoing_connection.
        alias m max_outgoing_connection

        # used for search index.
        #
        # @param metric [String] The distance metric between vectors ('l2', 'dot', or 'cosine').
        #
        # @param max_item [Integer] The maximum number of items.
        #
        # @param dim [Integer] The length of each vector that will be indexed.
        #
        # @param ef_construction [Integer] The size of the dynamic list for the nearest neighbors.
        #   It controls the index time/accuracy trade-off.
        #
        # @param max_outgoing_connection [Integer] The maximum number of outgoing connections in the graph
        #
        # reference: https://yoshoku.github.io/hnswlib.rb/doc/
        def initialize(
          metric: "l2",
          max_item: 10000,
          dim: 2,
          ef_construction: 200,
          max_outgoing_connection: 16
        )
          @metric = metric
          @max_item = max_item
          @dim = dim
          @ef_construction = ef_construction
          @max_outgoing_connection = max_outgoing_connection
        end

        # @return [String] 'ip' when the metric is 'dot', otherwise 'l2'.
        def space
          @metric == 'dot' ? 'ip' : 'l2'
        end

        # @return [String] the settings as pretty-printed JSON.
        def to_json(*args)
          JSON.pretty_generate(
            {
              metric: @metric,
              max_item: @max_item,
              dim: @dim,
              ef_construction: @ef_construction,
              max_outgoing_connection: @max_outgoing_connection
            },
            *args
          )
        end
      end
    end
  end
end
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'hnswlib'
4
+ require 'json'
5
+
6
module Boxcars
  module Embeddings
    module Hnswlib
      # Runs a nearest-neighbour query against an hnswlib index and maps the hits
      # back to the document texts stored in a JSON file.
      class HnswlibSearch
        # @param vector_store [::Hnswlib::HierarchicalNSW] The index to query.
        # @param options [Hash] :json_doc_path (path of the documents JSON file) and
        #   :num_neighbors (how many neighbours to request, 1 when absent).
        # @raise [ArgumentError] if vector_store is not an Hnswlib::HierarchicalNSW.
        def initialize(vector_store:, options: {})
          validate_params(vector_store)

          @vector_store = vector_store
          @json_doc_path = options[:json_doc_path]
          @num_neighbors = options[:num_neighbors] || 1
        end

        # @param query [Object] The query handed to the index's search_knn.
        # @return [Array<Hash>] one { document:, distance: } hash per hit whose doc_id
        #   is present in the documents file.
        def call(query)
          search(query)
        end

        private

        attr_reader :json_doc_path, :vector_store, :num_neighbors

        def validate_params(vector_store)
          return if vector_store.is_a?(::Hnswlib::HierarchicalNSW)

          raise_error 'vector_store must be an Hnswlib::HierarchicalNSW'
        end

        # NOTE(review): this treats every element returned by search_knn as a
        # [doc_id, distance] pair - confirm that against the hnswlib gem's documented
        # return shape for HierarchicalNSW#search_knn.
        def search(query)
          neighbours = vector_store.search_knn(query, num_neighbors)

          neighbours.each_with_object([]) do |(doc_id, distance), hits|
            entry = parsed_data.find { |candidate| candidate[:doc_id] == doc_id }
            hits << { document: entry[:document], distance: distance } if entry
          end
        end

        # The documents file is read and parsed once, on first use.
        def parsed_data
          @parsed_data ||= begin
            raw = File.read(json_doc_path)
            JSON.parse(raw, symbolize_names: true)
          end
        end

        def raise_error(message)
          raise ArgumentError, message
        end
      end
    end
  end
end
@@ -0,0 +1,80 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'hnswlib'
4
+ require 'json'
5
+ require 'fileutils'
6
+
7
module Boxcars
  module Embeddings
    module Hnswlib
      # Writes document embeddings to disk as an hnswlib index plus two JSON side files:
      # the documents file and hnswlib_config.json (stored next to the index file).
      class SaveToHnswlib
        include Embeddings

        # @param document_embeddings [Array] An array of hashes containing the document id, document text, and embedding.
        # @param index_file_path [String] The path to the index file.
        # @param hnswlib_config [Boxcars::Embeddings::Hnswlib::HnswlibConfig] The config object for the hnswlib index.
        # @param json_doc_file_path [String, nil] Optional. The path to the json file containing the document text.
        #   Defaults to the index file path with ".bin" replaced by ".json".
        def initialize(document_embeddings:, index_file_path:, hnswlib_config:, json_doc_file_path: nil)
          @document_embeddings = document_embeddings
          @index_file_path = index_file_path
          @json_doc_file_path = json_doc_file_path || default_json_doc_file_path(index_file_path)
          @hnswlib_config = hnswlib_config
        end

        # Validate the inputs, add every embedding to a fresh index and write all files.
        # @raise [Boxcars::ValueError] when an input is invalid or a parent directory is missing.
        def call
          validate_params

          # The index is built only after validation, so a bad config is reported by
          # validate_params rather than by the index constructor.
          index = build_index
          document_texts = document_embeddings.map do |embedding|
            index.add_item(embedding[:doc_id], embedding[:embedding])

            { doc_id: embedding[:doc_id], embedding: embedding[:embedding], document: embedding[:document] }
          end

          write_files(index, document_texts)
        end

        private

        attr_reader :document_embeddings, :index_file_path, :json_doc_file_path, :hnswlib_config

        # A non-string index path yields nil here so validate_params can report it.
        def default_json_doc_file_path(index_file_path)
          index_file_path.gsub(/\.bin$/, '.json') if index_file_path.is_a?(String)
        end

        def build_index
          ::Hnswlib::HnswIndex.new(
            n_features: hnswlib_config.dim,
            max_item: hnswlib_config.max_item,
            metric: hnswlib_config.metric
          )
        end

        def write_files(index, document_texts)
          FileUtils.mkdir_p(File.dirname(json_doc_file_path))
          File.write(json_doc_file_path, document_texts.to_json)

          FileUtils.mkdir_p(File.dirname(index_file_path))
          File.write("#{File.dirname(index_file_path)}/hnswlib_config.json", hnswlib_config.to_json)

          index.save(index_file_path)
        end

        def validate_params
          raise_error("document_embeddings must be an array") unless document_embeddings.is_a?(Array)
          raise_error("dim must be an integer") unless hnswlib_config.dim.is_a?(Integer)
          raise_error("index_file_path must be a string") unless index_file_path.is_a?(String)

          [index_file_path, json_doc_file_path].each do |path|
            check_parent_directory(path)
          end
        end

        def check_parent_directory(path)
          return unless path

          parent_dir = File.dirname(path)
          raise_error('parent directory must exist') unless File.directory?(parent_dir)
        end

        def raise_error(message)
          raise ::Boxcars::ValueError, message
        end
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'hnswlib'
4
+
5
module Boxcars
  module Embeddings
    # Embeds a text query and looks up its nearest documents in a vector store.
    class SimilaritySearch
      # @param embeddings [String] Passed to the search backend as its :json_doc_path option.
      # @param vector_store [::Hnswlib::HierarchicalNSW] The index to search.
      # @param openai_connection [Object] The connection handed to EmbedViaOpenAI for the query.
      # @raise [ArgumentError] if the vector store type is not supported.
      def initialize(embeddings:, vector_store:, openai_connection:)
        @embeddings = embeddings
        @vector_store = vector_store
        @openai_connection = openai_connection
        @similarity_search_instance = create_similarity_search_instance
      end

      # @param query [String] A non-empty query text.
      # @return [Object] whatever the search backend returns for the embedded query.
      # @raise [ArgumentError] if query is not a String or is empty.
      def call(query:)
        validate_query(query)
        @similarity_search_instance.call(convert_query_to_vector(query))
      end

      private

      attr_reader :embeddings, :vector_store, :openai_connection

      def validate_query(query)
        raise_error 'query must be a string' unless query.is_a?(String)
        raise_error 'query must not be empty' if query.empty?
      end

      # Embed the single query text and return just its vector.
      def convert_query_to_vector(query)
        embedded = Boxcars::Embeddings::EmbedViaOpenAI.call(texts: [query], openai_connection: openai_connection)
        embedded.first[:embedding]
      end

      # Only hnswlib indexes are supported; the backend is asked for 2 neighbours.
      def create_similarity_search_instance
        raise_error 'Unsupported vector store provided' unless ::Hnswlib::HierarchicalNSW === vector_store

        backend_options = { json_doc_path: embeddings, num_neighbors: 2 }
        Boxcars::Embeddings::Hnswlib::HnswlibSearch.new(vector_store: vector_store, options: backend_options)
      end

      def raise_error(message)
        raise ArgumentError, message
      end
    end
  end
end
@@ -0,0 +1,104 @@
1
+ # frozen_string_literal: true
2
+
3
module Boxcars
  module Embeddings
    # Breaks a text apart on a separator and regroups the pieces into chunks
    # bounded by chunk_size, carrying up to chunk_overlap characters of trailing
    # pieces over into the next chunk.
    class SplitText
      include Embeddings

      attr_reader :separator, :chunk_size, :chunk_overlap, :text

      # @param separator [String] The string to use to split the text.
      # @param chunk_size [Integer] The size of each chunk.
      # @param chunk_overlap [Integer] The amount of overlap between chunks.
      # @param text [String] The text to split.
      # @raise [Boxcars::ValueError] when an argument has the wrong type or
      #   chunk_overlap is not smaller than chunk_size.
      def initialize(separator: "Search", chunk_size: 7, chunk_overlap: 3, text: "")
        validate_params(separator, chunk_size, chunk_overlap, text)

        @separator = separator
        @chunk_size = chunk_size
        @chunk_overlap = chunk_overlap
        @text = text
      end

      # @return [Array<String>] the merged chunks, sorted with Array#sort.
      def call
        pieces = text.split(separator)
        merge_splits(pieces, separator).sort
      end

      private

      def validate_params(separator, chunk_size, chunk_overlap, text)
        type_checks = {
          "separator must be a string" => separator.is_a?(String),
          "chunk_size must be an integer" => chunk_size.is_a?(Integer),
          "chunk_overlap must be an integer" => chunk_overlap.is_a?(Integer),
          "text must be a string" => text.is_a?(String)
        }
        type_checks.each { |message, passed| raise_error(message) unless passed }

        raise_error("chunk_overlap must be less than chunk_size") if chunk_overlap >= chunk_size
      end

      def raise_error(message)
        raise ::Boxcars::ValueError, message
      end

      # Walk the pieces, keeping a sliding window of pieces that form the current chunk.
      def merge_splits(splits, separator)
        chunks = []
        window = []
        window_len = 0

        splits.each do |piece|
          window_len = flush_window(window, window_len, piece.length, chunks, separator)
          window.push(piece)
          window_len += piece.length
        end

        tail = join_docs(window, separator)
        chunks << tail if tail
        chunks
      end

      # When adding the next piece would reach chunk_size, emit the window as a chunk and
      # drop pieces from its front until what remains fits the overlap and size limits.
      # Returns the length of what is left in the window.
      def flush_window(window, window_len, next_len, chunks, separator)
        return window_len if window_len + next_len < chunk_size

        if window_len > chunk_size
          puts "Created a chunk of size #{window_len}, which is longer than the specified #{chunk_size}"
        end
        return window_len if window.empty?

        finished = join_docs(window, separator)
        chunks << finished if finished

        until window_len <= chunk_overlap && !(window_len + next_len > chunk_size && window_len.positive?)
          window_len -= window.shift.length
        end
        window_len
      end

      # Join pieces with the separator; nil when nothing but whitespace remains.
      def join_docs(docs, separator)
        joined = docs.join(separator).strip
        joined.empty? ? nil : joined
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Boxcars is a framework for running a series of tools to get an answer to a question.
4
module Boxcars
  # Mixin for service-style classes: including it adds a class-level `call`
  # and makes `new` private on the including class.
  module Embeddings
    # Class-level helpers added to every class that includes Embeddings.
    module ClassMethods
      class EmbeddingsError < StandardError; end

      # Build an instance with the given arguments and immediately invoke its #call.
      def call(*args, **kw_args)
        instance = new(*args, **kw_args)
        instance.call
      end
    end

    # Hook run on `include Embeddings`: adds ClassMethods to the including class
    # and hides its `new`.
    def self.included(base)
      base.extend(ClassMethods)
      base.private_class_method :new
    end
  end
end
23
+
24
+ require_relative "embeddings/document"
25
+ require_relative "embeddings/embed_via_open_ai"
26
+ require_relative "embeddings/split_text"
27
+ require_relative "embeddings/similarity_search"
28
+ require_relative "embeddings/hnswlib/hnswlib_config"
29
+ require_relative "embeddings/hnswlib/save_to_hnswlib"
30
+ require_relative "embeddings/hnswlib/build_vector_store"
31
+ require_relative "embeddings/hnswlib/hnswlib_search"
@@ -57,7 +57,15 @@ module Boxcars
57
57
  raise Error, "Got error from SerpAPI: {res[:error]}" if res[:error]
58
58
 
59
59
  ANSWER_LOCATIONS.each do |path|
60
- return res.dig(*path) if res.dig(*path)
60
+ next unless res.dig(*path)
61
+
62
+ Boxcars.debug("Found SERP answer at #{path}", :cyan)
63
+ path_link = path.dup
64
+ last_word = path_link.pop
65
+ path_link << :link
66
+ return { last_word => res.dig(*path), url: res.dig(*path_link) } if last_word.is_a?(Symbol) && res.dig(*path_link)
67
+
68
+ return res.dig(*path)
61
69
  end
62
70
  "No good search result found"
63
71
  end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Boxcars is a framework for running a series of tools to get an answer to a question.
4
module Boxcars
  # Boxcar subclass for embeddings; it currently only defines its own Error class.
  class Embedding < Boxcar
    # Error type scoped to Boxcars::Embedding.
    class Error < StandardError; end
  end
end
10
+
11
+ require "boxcars/boxcar/embeddings"
@@ -0,0 +1,56 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'gpt4all'
4
+ # Boxcars is a framework for running a series of tools to get an answer to a question.
5
module Boxcars
  # An engine that answers prompts with a locally run GPT4All bot.
  class Gpt4allEng < Engine
    attr_reader :prompts, :model_kwargs, :batch_size

    # the default name of the engine
    DEFAULT_NAME = "Gpt4all engine"
    # the default description of the engine
    DEFAULT_DESCRIPTION = "useful for when you need to use local AI to answer questions. " \
                          "You should ask targeted questions"

    # A engine is a container for a single tool to run.
    # @param name [String] The name of the engine. Defaults to "Gpt4all engine".
    # @param description [String] A description of the engine. Defaults to:
    #        "useful for when you need to use local AI to answer questions. You should ask targeted questions".
    # @param prompts [Array<String>] The prompts to use when asking the engine. Defaults to [].
    # @param batch_size [Integer] The number of prompts to send to the engine at once. Defaults to 2.
    def initialize(name: DEFAULT_NAME, description: DEFAULT_DESCRIPTION, prompts: [], batch_size: 2, **_kwargs)
      @prompts = prompts
      @batch_size = batch_size
      super(description: description, name: name)
    end

    # Get an answer from the engine.
    # Starts a GPT4All bot, sends it the formatted prompt and always stops the bot afterwards.
    # Any StandardError is reported through Boxcars.error instead of being re-raised.
    # @param prompt [Object] The prompt to use; must respond to as_prompt(inputs).
    # @param inputs [Hash] The values used to format the prompt.
    # @param _kwargs [Hash] Additional parameters; accepted but ignored.
    def client(prompt:, inputs: {}, **_kwargs)
      gpt4all = Gpt4all::ConversationalAI.new
      gpt4all.prepare_resources(force_download: false)
      gpt4all.start_bot
      input_text = prompt.as_prompt(inputs)[:prompt]
      Boxcars.debug("Prompt after formatting:\n#{input_text}", :cyan) if Boxcars.configuration.log_prompts
      gpt4all.prompt(input_text)
    rescue StandardError => e
      # last(5) still reports the frames when the backtrace is shorter than five lines.
      Boxcars.error(["Error from gpt4all engine: #{e}", Array(e.backtrace).last(5)].flatten.join("\n "))
    ensure
      # gpt4all is nil when the constructor itself raised; do not mask that error.
      gpt4all&.stop_bot
    end

    # get an answer from the engine for a question.
    # @param question [String] The question to ask the engine.
    # @param kwargs [Hash] Additional parameters to pass to the engine if wanted.
    def run(question, **kwargs)
      prompt = Prompt.new(template: question)
      answer = client(prompt: prompt, **kwargs)
      Boxcars.debug("Answer: #{answer}", :cyan)
      answer
    end
  end
end
@@ -21,3 +21,4 @@ end
21
21
 
22
22
  require "boxcars/engine/engine_result"
23
23
  require "boxcars/engine/openai"
24
+ require "boxcars/engine/gpt4all_eng"
@@ -2,5 +2,5 @@
2
2
 
3
3
  module Boxcars
4
4
  # The current version of the gem.
5
- VERSION = "0.2.7"
5
+ VERSION = "0.2.8"
6
6
  end
data/lib/boxcars.rb CHANGED
@@ -170,3 +170,4 @@ require "boxcars/ruby_repl"
170
170
  require "boxcars/engine"
171
171
  require "boxcars/boxcar"
172
172
  require "boxcars/train"
173
+ require "boxcars/embedding"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: boxcars
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.7
4
+ version: 0.2.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francis Sullivan
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2023-04-13 00:00:00.000000000 Z
12
+ date: 2023-04-19 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: debug
@@ -67,6 +67,20 @@ dependencies:
67
67
  - - "~>"
68
68
  - !ruby/object:Gem::Version
69
69
  version: '2.2'
70
+ - !ruby/object:Gem::Dependency
71
+ name: gpt4all
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - "~>"
75
+ - !ruby/object:Gem::Version
76
+ version: 0.0.4
77
+ type: :runtime
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - "~>"
82
+ - !ruby/object:Gem::Version
83
+ version: 0.0.4
70
84
  - !ruby/object:Gem::Dependency
71
85
  name: ruby-openai
72
86
  requirement: !ruby/object:Gem::Requirement
@@ -106,6 +120,15 @@ files:
106
120
  - lib/boxcars/boxcar.rb
107
121
  - lib/boxcars/boxcar/active_record.rb
108
122
  - lib/boxcars/boxcar/calculator.rb
123
+ - lib/boxcars/boxcar/embeddings.rb
124
+ - lib/boxcars/boxcar/embeddings/document.rb
125
+ - lib/boxcars/boxcar/embeddings/embed_via_open_ai.rb
126
+ - lib/boxcars/boxcar/embeddings/hnswlib/build_vector_store.rb
127
+ - lib/boxcars/boxcar/embeddings/hnswlib/hnswlib_config.rb
128
+ - lib/boxcars/boxcar/embeddings/hnswlib/hnswlib_search.rb
129
+ - lib/boxcars/boxcar/embeddings/hnswlib/save_to_hnswlib.rb
130
+ - lib/boxcars/boxcar/embeddings/similarity_search.rb
131
+ - lib/boxcars/boxcar/embeddings/split_text.rb
109
132
  - lib/boxcars/boxcar/engine_boxcar.rb
110
133
  - lib/boxcars/boxcar/google_search.rb
111
134
  - lib/boxcars/boxcar/sql.rb
@@ -113,8 +136,10 @@ files:
113
136
  - lib/boxcars/boxcar/wikipedia_search.rb
114
137
  - lib/boxcars/conversation.rb
115
138
  - lib/boxcars/conversation_prompt.rb
139
+ - lib/boxcars/embedding.rb
116
140
  - lib/boxcars/engine.rb
117
141
  - lib/boxcars/engine/engine_result.rb
142
+ - lib/boxcars/engine/gpt4all_eng.rb
118
143
  - lib/boxcars/engine/openai.rb
119
144
  - lib/boxcars/generation.rb
120
145
  - lib/boxcars/prompt.rb