boxcars 0.7.7 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +6 -3
- data/.ruby-version +1 -1
- data/CHANGELOG.md +17 -0
- data/Gemfile +3 -13
- data/Gemfile.lock +30 -25
- data/POSTHOG_TEST_README.md +118 -0
- data/README.md +305 -0
- data/boxcars.gemspec +2 -2
- data/lib/boxcars/boxcar/active_record.rb +9 -10
- data/lib/boxcars/boxcar/calculator.rb +2 -2
- data/lib/boxcars/boxcar/engine_boxcar.rb +4 -4
- data/lib/boxcars/boxcar/google_search.rb +2 -2
- data/lib/boxcars/boxcar/json_engine_boxcar.rb +1 -1
- data/lib/boxcars/boxcar/ruby_calculator.rb +1 -1
- data/lib/boxcars/boxcar/sql_base.rb +4 -4
- data/lib/boxcars/boxcar/swagger.rb +3 -3
- data/lib/boxcars/boxcar/vector_answer.rb +3 -3
- data/lib/boxcars/boxcar/xml_engine_boxcar.rb +1 -1
- data/lib/boxcars/boxcar.rb +6 -6
- data/lib/boxcars/conversation_prompt.rb +3 -3
- data/lib/boxcars/engine/anthropic.rb +121 -23
- data/lib/boxcars/engine/cerebras.rb +2 -2
- data/lib/boxcars/engine/cohere.rb +135 -9
- data/lib/boxcars/engine/gemini_ai.rb +151 -76
- data/lib/boxcars/engine/google.rb +2 -2
- data/lib/boxcars/engine/gpt4all_eng.rb +92 -34
- data/lib/boxcars/engine/groq.rb +124 -73
- data/lib/boxcars/engine/intelligence_base.rb +52 -17
- data/lib/boxcars/engine/ollama.rb +127 -47
- data/lib/boxcars/engine/openai.rb +186 -103
- data/lib/boxcars/engine/perplexityai.rb +116 -136
- data/lib/boxcars/engine/together.rb +2 -2
- data/lib/boxcars/engine/unified_observability.rb +430 -0
- data/lib/boxcars/engine.rb +4 -3
- data/lib/boxcars/engines.rb +74 -0
- data/lib/boxcars/observability.rb +44 -0
- data/lib/boxcars/observability_backend.rb +17 -0
- data/lib/boxcars/observability_backends/multi_backend.rb +42 -0
- data/lib/boxcars/observability_backends/posthog_backend.rb +89 -0
- data/lib/boxcars/observation.rb +8 -8
- data/lib/boxcars/prompt.rb +16 -4
- data/lib/boxcars/result.rb +7 -12
- data/lib/boxcars/ruby_repl.rb +1 -1
- data/lib/boxcars/train/train_action.rb +1 -1
- data/lib/boxcars/train/xml_train.rb +3 -3
- data/lib/boxcars/train/xml_zero_shot.rb +1 -1
- data/lib/boxcars/train/zero_shot.rb +3 -3
- data/lib/boxcars/train.rb +1 -1
- data/lib/boxcars/vector_search.rb +5 -5
- data/lib/boxcars/vector_store/pgvector/build_from_array.rb +115 -88
- data/lib/boxcars/vector_store/pgvector/build_from_files.rb +105 -80
- data/lib/boxcars/vector_store/pgvector/save_to_database.rb +147 -122
- data/lib/boxcars/vector_store/pgvector/search.rb +156 -131
- data/lib/boxcars/vector_store.rb +4 -4
- data/lib/boxcars/version.rb +1 -1
- data/lib/boxcars.rb +31 -20
- metadata +25 -21
@@ -1,96 +1,121 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
|
4
|
-
require '
|
5
|
-
require '
|
6
|
-
|
7
|
-
|
8
|
-
module
|
9
|
-
module
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
3
|
+
if Gem.loaded_specs.key?('pgvector') && Gem.loaded_specs.key?('pg')
|
4
|
+
require 'pgvector'
|
5
|
+
require 'fileutils'
|
6
|
+
require 'json'
|
7
|
+
|
8
|
+
module Boxcars
|
9
|
+
module VectorStore
|
10
|
+
module Pgvector
|
11
|
+
class BuildFromFiles
|
12
|
+
include VectorStore
|
13
|
+
|
14
|
+
# @param training_data_path [String] path to training data files
|
15
|
+
# @param split_chunk_size [Integer] number of characters to split the text into
|
16
|
+
# @param embedding_tool [Symbol] embedding tool to use
|
17
|
+
# @param database_url [String] database url
|
18
|
+
# @param table_name [String] table name
|
19
|
+
# @param embedding_column_name [String] embedding column name
|
20
|
+
# @param content_column_name [String] content column name
|
21
|
+
# @param metadata_column_name [String] metadata column name
|
22
|
+
# @return [Hash] vector_store: array of hashes with :content, :metadata, and :embedding keys
|
23
|
+
def initialize(params)
|
24
|
+
@split_chunk_size = params[:split_chunk_size] || 2000
|
25
|
+
@training_data_path = File.absolute_path(params[:training_data_path])
|
26
|
+
@embedding_tool = params[:embedding_tool] || :openai
|
27
|
+
|
28
|
+
validate_params(embedding_tool, training_data_path)
|
29
|
+
|
30
|
+
@database_url = params[:database_url]
|
31
|
+
@table_name = params[:table_name]
|
32
|
+
@embedding_column_name = params[:embedding_column_name]
|
33
|
+
@content_column_name = params[:content_column_name]
|
34
|
+
@metadata_column_name = params[:metadata_column_name]
|
35
|
+
|
36
|
+
@pg_vectors = []
|
37
|
+
end
|
37
38
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
39
|
+
# @return [Hash] type: :pgvector, vector_store: array of Boxcars::VectorStore::Document
|
40
|
+
def call
|
41
|
+
data = load_data_files(training_data_path)
|
42
|
+
texts = split_text_into_chunks(data)
|
43
|
+
embeddings = generate_vectors(texts)
|
44
|
+
add_vectors(embeddings, texts)
|
45
|
+
documents = save_vector_store
|
46
|
+
|
47
|
+
{
|
48
|
+
type: :pgvector,
|
49
|
+
vector_store: documents
|
50
|
+
}
|
51
|
+
end
|
51
52
|
|
52
|
-
|
53
|
+
private
|
53
54
|
|
54
|
-
|
55
|
-
|
56
|
-
|
55
|
+
attr_reader :split_chunk_size, :training_data_path, :embedding_tool, :database_url,
|
56
|
+
:table_name, :embedding_column_name, :content_column_name,
|
57
|
+
:metadata_column_name, :pg_vectors
|
57
58
|
|
58
|
-
|
59
|
-
|
59
|
+
def validate_params(embedding_tool, training_data_path)
|
60
|
+
training_data_dir = File.dirname(training_data_path.gsub(/\*{1,2}/, ''))
|
60
61
|
|
61
|
-
|
62
|
-
|
63
|
-
|
62
|
+
raise_argument_error('training_data_path parent directory must exist') unless Dir.exist?(training_data_dir)
|
63
|
+
raise_argument_error('No files found at the training_data_path pattern') if Dir.glob(training_data_path).empty?
|
64
|
+
return if %i[openai tensorflow].include?(embedding_tool)
|
64
65
|
|
65
|
-
|
66
|
-
|
66
|
+
raise_argument_error('embedding_tool is invalid')
|
67
|
+
end
|
68
|
+
|
69
|
+
def add_vectors(vectors, texts)
|
70
|
+
vectors.map.with_index do |vector, index|
|
71
|
+
pg_vector = Document.new(
|
72
|
+
content: texts[index],
|
73
|
+
embedding: vector[:embedding],
|
74
|
+
metadata: {
|
75
|
+
doc_id: index,
|
76
|
+
training_data_path: training_data_path
|
77
|
+
}
|
78
|
+
)
|
79
|
+
pg_vectors << pg_vector
|
80
|
+
end
|
81
|
+
end
|
67
82
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
}
|
83
|
+
def save_vector_store
|
84
|
+
result = Boxcars::VectorStore::Pgvector::SaveToDatabase.call(
|
85
|
+
pg_vectors: pg_vectors,
|
86
|
+
database_url: database_url,
|
87
|
+
table_name: table_name,
|
88
|
+
embedding_column_name: embedding_column_name,
|
89
|
+
content_column_name: content_column_name,
|
90
|
+
metadata_column_name: metadata_column_name
|
77
91
|
)
|
78
|
-
|
92
|
+
raise_argument_error('Error saving vector store to database.') unless result
|
93
|
+
|
94
|
+
result
|
79
95
|
end
|
80
96
|
end
|
97
|
+
end
|
98
|
+
end
|
99
|
+
end
|
100
|
+
else
|
101
|
+
# Define placeholder modules/classes that raise an error when the 'pgvector' or 'pg' gem is not loaded
|
102
|
+
module Boxcars
|
103
|
+
module VectorStore
|
104
|
+
module Pgvector
|
105
|
+
class PgvectorNotAvailableError < StandardError
|
106
|
+
def initialize(message = "The 'pgvector' and 'pg' gems are required. Please add them to your Gemfile.")
|
107
|
+
super
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
class BuildFromFiles
|
112
|
+
def initialize(*_args)
|
113
|
+
raise PgvectorNotAvailableError
|
114
|
+
end
|
81
115
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
database_url: database_url,
|
86
|
-
table_name: table_name,
|
87
|
-
embedding_column_name: embedding_column_name,
|
88
|
-
content_column_name: content_column_name,
|
89
|
-
metadata_column_name: metadata_column_name
|
90
|
-
)
|
91
|
-
raise_argument_error('Error saving vector store to database.') unless result
|
92
|
-
|
93
|
-
result
|
116
|
+
def call(*_args)
|
117
|
+
raise PgvectorNotAvailableError
|
118
|
+
end
|
94
119
|
end
|
95
120
|
end
|
96
121
|
end
|
@@ -1,148 +1,173 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
|
4
|
-
require '
|
5
|
-
|
6
|
-
|
7
|
-
module
|
8
|
-
module
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
3
|
+
if Gem.loaded_specs.key?('pgvector') && Gem.loaded_specs.key?('pg')
|
4
|
+
require 'pg'
|
5
|
+
require 'pgvector'
|
6
|
+
|
7
|
+
module Boxcars
|
8
|
+
module VectorStore
|
9
|
+
module Pgvector
|
10
|
+
class SaveToDatabase
|
11
|
+
include VectorStore
|
12
|
+
|
13
|
+
# @param pg_vectors [Array] array of Boxcars::VectorStore::Document
|
14
|
+
# @param database_url [String] database url
|
15
|
+
# @param table_name [String] table name
|
16
|
+
# @param embedding_column_name [String] embedding column name
|
17
|
+
# @param content_column_name [String] content column name
|
18
|
+
# @param metadata_column_name [String] metadata column name
|
19
|
+
# @return [Array] array of Boxcars::VectorStore::Document
|
20
|
+
def initialize(params)
|
21
|
+
validate_param_types(params)
|
22
|
+
@db_connection = test_db_params(params)
|
23
|
+
|
24
|
+
@table_name = params[:table_name]
|
25
|
+
@content_column_name = params[:content_column_name]
|
26
|
+
@embedding_column_name = params[:embedding_column_name]
|
27
|
+
@metadata_column_name = params[:metadata_column_name]
|
28
|
+
|
29
|
+
@pg_vectors = params[:pg_vectors]
|
30
|
+
end
|
30
31
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
32
|
+
# @return [Array] array of Boxcars::VectorStore::Document
|
33
|
+
def call
|
34
|
+
add_vectors_to_database
|
35
|
+
end
|
35
36
|
|
36
|
-
|
37
|
+
private
|
37
38
|
|
38
|
-
|
39
|
-
|
40
|
-
|
39
|
+
attr_reader :database_url, :pg_vectors, :db_connection, :table_name,
|
40
|
+
:embedding_column_name, :content_column_name,
|
41
|
+
:metadata_column_name
|
41
42
|
|
42
|
-
|
43
|
-
|
43
|
+
def validate_param_types(params)
|
44
|
+
pg_vectors = params[:pg_vectors]
|
44
45
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
46
|
+
raise_argument_error('pg_vectors must be an array') unless pg_vectors.is_a?(Array)
|
47
|
+
raise_argument_error('missing data') if pg_vectors.empty?
|
48
|
+
raise_argument_error('invalid vector_store') unless valid_vector_store?(pg_vectors)
|
49
|
+
@database_url = params[:database_url]
|
50
|
+
raise_argument_error('missing database_url argument') if @database_url.to_s.empty?
|
51
|
+
end
|
51
52
|
|
52
|
-
|
53
|
-
|
54
|
-
|
53
|
+
def valid_vector_store?(pg_vectors)
|
54
|
+
pg_vectors.all? do |doc|
|
55
|
+
doc.is_a?(Boxcars::VectorStore::Document)
|
56
|
+
end
|
57
|
+
rescue TypeError => e
|
58
|
+
raise_argument_error(e.message)
|
55
59
|
end
|
56
|
-
rescue TypeError => e
|
57
|
-
raise_argument_error(e.message)
|
58
|
-
end
|
59
60
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
61
|
+
def test_db_params(params)
|
62
|
+
conn = ::PG::Connection.new(@database_url)
|
63
|
+
|
64
|
+
check_db_connection(conn)
|
65
|
+
check_vector_extension(conn)
|
66
|
+
check_table_exists(conn, params[:table_name])
|
67
|
+
check_column_exists(conn, params)
|
68
|
+
|
69
|
+
registry = PG::BasicTypeRegistry.new.define_default_types
|
70
|
+
::Pgvector::PG.register_vector(registry)
|
71
|
+
conn.type_map_for_queries = PG::BasicTypeMapForQueries.new(conn, registry: registry)
|
72
|
+
conn.type_map_for_results = PG::BasicTypeMapForResults.new(conn, registry: registry)
|
73
|
+
conn
|
74
|
+
rescue PG::Error, NameError => e
|
75
|
+
raise_argument_error(e.message)
|
76
|
+
end
|
76
77
|
|
77
|
-
|
78
|
-
|
78
|
+
def check_db_connection(conn)
|
79
|
+
return if conn.status == PG::CONNECTION_OK
|
79
80
|
|
80
|
-
|
81
|
-
|
81
|
+
raise_argument_error("PostgreSQL connection is not ok")
|
82
|
+
end
|
82
83
|
|
83
|
-
|
84
|
-
|
84
|
+
def check_vector_extension(conn)
|
85
|
+
return if conn.exec("SELECT 1 FROM pg_extension WHERE extname = 'vector'").any?
|
85
86
|
|
86
|
-
|
87
|
-
|
87
|
+
raise_argument_error("PostgreSQL 'vector' extension is not installed")
|
88
|
+
end
|
89
|
+
|
90
|
+
def check_table_exists(conn, table_name)
|
91
|
+
table_exists = conn.exec_params(
|
92
|
+
"SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = $1)", [table_name]
|
93
|
+
).getvalue(0, 0) == "t"
|
94
|
+
return if table_exists
|
88
95
|
|
89
|
-
|
90
|
-
|
91
|
-
"SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = $1)", [table_name]
|
92
|
-
).getvalue(0, 0) == "t"
|
93
|
-
return if table_exists
|
96
|
+
raise_argument_error("Table '#{table_name}' does not exist")
|
97
|
+
end
|
94
98
|
|
95
|
-
|
96
|
-
|
99
|
+
def check_column_exists(conn, params)
|
100
|
+
column_names = %i[embedding_column_name content_column_name metadata_column_name]
|
101
|
+
table_name = params[:table_name]
|
97
102
|
|
98
|
-
|
99
|
-
|
100
|
-
|
103
|
+
column_names.each do |target|
|
104
|
+
column_name = params[target]
|
105
|
+
column_exists = conn.exec_params(
|
106
|
+
"SELECT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = $1 AND column_name = $2)",
|
107
|
+
[table_name, column_name]
|
108
|
+
).getvalue(0, 0) == "t"
|
109
|
+
next if column_exists
|
101
110
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
"SELECT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = $1 AND column_name = $2)",
|
106
|
-
[table_name, column_name]
|
107
|
-
).getvalue(0, 0) == "t"
|
108
|
-
next if column_exists
|
111
|
+
raise_argument_error("Column '#{column_name}' does not exist in table '#{table_name}'")
|
112
|
+
end
|
113
|
+
end
|
109
114
|
|
110
|
-
|
115
|
+
def add_vectors_to_database
|
116
|
+
pg_vectors.each do |document|
|
117
|
+
embedding = document.embedding.map(&:to_f)
|
118
|
+
content = document.content
|
119
|
+
metadata = document.metadata.to_json
|
120
|
+
|
121
|
+
if document.metadata[:id]
|
122
|
+
id = document.metadata[:id]
|
123
|
+
# NOTE: table_name, embedding_column_name, content_column_name, and metadata_column_name are interpolated directly
|
124
|
+
# into the SQL command. If these values are coming from an untrusted source,
|
125
|
+
# there is a risk of SQL injection
|
126
|
+
sql = <<-SQL
|
127
|
+
INSERT INTO #{table_name} (id, #{embedding_column_name}, #{content_column_name}, #{metadata_column_name})
|
128
|
+
VALUES ($1, $2, $3, $4)
|
129
|
+
ON CONFLICT (id) DO UPDATE
|
130
|
+
SET #{embedding_column_name} = EXCLUDED.#{embedding_column_name},
|
131
|
+
#{content_column_name} = EXCLUDED.#{content_column_name},
|
132
|
+
#{metadata_column_name} = EXCLUDED.#{metadata_column_name}
|
133
|
+
SQL
|
134
|
+
# The values, by contrast, are passed separately from the SQL command, so
|
135
|
+
# there's no risk of them being interpreted as part of the command.
|
136
|
+
db_connection.exec_params(sql, [id, embedding, content, metadata])
|
137
|
+
else
|
138
|
+
sql = <<-SQL
|
139
|
+
INSERT INTO #{table_name} (#{embedding_column_name}, #{content_column_name}, #{metadata_column_name})
|
140
|
+
VALUES ($1, $2, $3)
|
141
|
+
SQL
|
142
|
+
db_connection.exec_params(sql, [embedding, content, metadata])
|
143
|
+
end
|
144
|
+
end
|
145
|
+
rescue PG::Error => e
|
146
|
+
raise_argument_error(e.message)
|
147
|
+
end
|
148
|
+
end
|
149
|
+
end
|
150
|
+
end
|
151
|
+
end
|
152
|
+
else
|
153
|
+
# Define placeholder modules/classes that raise an error when the 'pgvector' or 'pg' gem is not loaded
|
154
|
+
module Boxcars
|
155
|
+
module VectorStore
|
156
|
+
module Pgvector
|
157
|
+
class PgvectorNotAvailableError < StandardError
|
158
|
+
def initialize(message = "The 'pgvector' and 'pg' gems are required. Please add them to your Gemfile.")
|
159
|
+
super
|
111
160
|
end
|
112
161
|
end
|
113
162
|
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
id = document.metadata[:id]
|
122
|
-
# directly inserting table_name, embedding_column_name, and content_column_name
|
123
|
-
# into the SQL command. If these values are coming from an untrusted source,
|
124
|
-
# there is a risk of SQL injection
|
125
|
-
sql = <<-SQL
|
126
|
-
INSERT INTO #{table_name} (id, #{embedding_column_name}, #{content_column_name}, #{metadata_column_name})
|
127
|
-
VALUES ($1, $2, $3, $4)
|
128
|
-
ON CONFLICT (id) DO UPDATE
|
129
|
-
SET #{embedding_column_name} = EXCLUDED.#{embedding_column_name},
|
130
|
-
#{content_column_name} = EXCLUDED.#{content_column_name},
|
131
|
-
#{metadata_column_name} = EXCLUDED.#{metadata_column_name}
|
132
|
-
SQL
|
133
|
-
# parameters are given separately from the SQL command,
|
134
|
-
# there's no risk of them being interpreted as part of the command.
|
135
|
-
db_connection.exec_params(sql, [id, embedding, content, metadata])
|
136
|
-
else
|
137
|
-
sql = <<-SQL
|
138
|
-
INSERT INTO #{table_name} (#{embedding_column_name}, #{content_column_name}, #{metadata_column_name})
|
139
|
-
VALUES ($1, $2, $3)
|
140
|
-
SQL
|
141
|
-
db_connection.exec_params(sql, [embedding, content, metadata])
|
142
|
-
end
|
163
|
+
class SaveToDatabase
|
164
|
+
def initialize(*_args)
|
165
|
+
raise PgvectorNotAvailableError
|
166
|
+
end
|
167
|
+
|
168
|
+
def call(*_args)
|
169
|
+
raise PgvectorNotAvailableError
|
143
170
|
end
|
144
|
-
rescue PG::Error => e
|
145
|
-
raise_argument_error(e.message)
|
146
171
|
end
|
147
172
|
end
|
148
173
|
end
|