langchainrb 0.3.12 → 0.3.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.env.example +2 -1
- data/CHANGELOG.md +8 -0
- data/Gemfile.lock +10 -1
- data/README.md +10 -6
- data/lib/langchain/processors/csv.rb +21 -0
- data/lib/langchain/processors/json.rb +17 -0
- data/lib/langchain/processors/jsonl.rb +19 -0
- data/lib/langchain.rb +6 -2
- data/lib/vectorsearch/pgvector.rb +100 -0
- data/lib/version.rb +1 -1
- metadata +34 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2ee811b2bac8fadea4d90c4212363a901829a4aac219da0f2a2dcbe7c6f59c5b
|
4
|
+
data.tar.gz: 8fa32e6df4aaf69cb6d29977913c1b8a30d6f65b777b1f90c8a7f504d869ca8f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cbb7e0c975333248c01082a47f7096fb9d6807c3b7619424eb9348238008d7b4257518287d9358114bf4e3a589349520ebf71ace00bf1fe8906afd27e8b1418a
|
7
|
+
data.tar.gz: 759444abe0b17518c6ef31fed6980f6bc0d3d096606860c4d6fddb8baeda4e0a23fc3909e42eba0f32912a786abec76cac54384533db2787e05d741f0907fa1d
|
data/.env.example
CHANGED
data/CHANGELOG.md
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
langchainrb (0.3.
|
4
|
+
langchainrb (0.3.13)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
@@ -148,9 +148,13 @@ GEM
|
|
148
148
|
milvus (0.9.1)
|
149
149
|
faraday (~> 1)
|
150
150
|
mini_mime (1.1.2)
|
151
|
+
mini_portile2 (2.8.2)
|
151
152
|
minitest (5.18.0)
|
152
153
|
multi_xml (0.6.0)
|
153
154
|
multipart-post (2.3.0)
|
155
|
+
nokogiri (1.14.3)
|
156
|
+
mini_portile2 (~> 2.8.0)
|
157
|
+
racc (~> 1.4)
|
154
158
|
nokogiri (1.14.3-arm64-darwin)
|
155
159
|
racc (~> 1.4)
|
156
160
|
nokogiri (1.14.3-x86_64-darwin)
|
@@ -166,6 +170,8 @@ GEM
|
|
166
170
|
hashery (~> 2.0)
|
167
171
|
ruby-rc4
|
168
172
|
ttfunk
|
173
|
+
pg (1.5.3)
|
174
|
+
pgvector (0.1.1)
|
169
175
|
pinecone (0.1.71)
|
170
176
|
dry-struct (~> 1.6.0)
|
171
177
|
dry-validation (~> 1.10.0)
|
@@ -273,6 +279,7 @@ GEM
|
|
273
279
|
PLATFORMS
|
274
280
|
arm64-darwin-21
|
275
281
|
arm64-darwin-22
|
282
|
+
ruby
|
276
283
|
x86_64-darwin-19
|
277
284
|
x86_64-darwin-22
|
278
285
|
x86_64-linux
|
@@ -290,6 +297,8 @@ DEPENDENCIES
|
|
290
297
|
milvus (~> 0.9.0)
|
291
298
|
nokogiri (~> 1.13)
|
292
299
|
pdf-reader (~> 1.4)
|
300
|
+
pg (~> 1.5)
|
301
|
+
pgvector (< 0.2)
|
293
302
|
pinecone (~> 0.1.6)
|
294
303
|
pry-byebug (~> 3.10.0)
|
295
304
|
qdrant-ruby (~> 0.9.0)
|
data/README.md
CHANGED
@@ -284,12 +284,16 @@ Langchain::Loader.load('https://www.example.com/file.pdf')
|
|
284
284
|
|
285
285
|
##### Supported Formats
|
286
286
|
|
287
|
-
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
287
|
+
|
288
|
+
| Format | Processor | Gem Requirements |
|
289
|
+
| ------ | ---------------------------- | :--------------------------: |
|
290
|
+
| docx | Langchain::Processors::Docx | `gem "docx", "~> 0.8.0"` |
|
291
|
+
| html | Langchain::Processors::HTML | `gem "nokogiri", "~> 1.13"` |
|
292
|
+
| pdf | Langchain::Processors::PDF | `gem "pdf-reader", "~> 1.4"` |
|
293
|
+
| text | Langchain::Processors::Text | |
|
294
|
+
| JSON | Langchain::Processors::JSON | |
|
295
|
+
| JSONL | Langchain::Processors::JSONL | |
|
296
|
+
| csv | Langchain::Processors::CSV | |
|
293
297
|
|
294
298
|
## Examples
|
295
299
|
Additional examples available: [/examples](https://github.com/andreibondarev/langchainrb/tree/main/examples)
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "csv"
|
4
|
+
|
5
|
+
module Langchain
|
6
|
+
module Processors
|
7
|
+
class CSV < Base
|
8
|
+
EXTENSIONS = [".csv"]
|
9
|
+
CONTENT_TYPES = ["text/csv"]
|
10
|
+
|
11
|
+
# Parse the CSV document and return its rows
|
12
|
+
# @param [File] data
|
13
|
+
# @return [Array<Array<String>>] one array of stripped field values per row
|
14
|
+
def parse(data)
|
15
|
+
::CSV.new(data.read).map do |row|
|
16
|
+
row.map(&:strip)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Langchain
|
4
|
+
module Processors
|
5
|
+
class JSON < Base
|
6
|
+
EXTENSIONS = [".json"]
|
7
|
+
CONTENT_TYPES = ["application/json"]
|
8
|
+
|
9
|
+
# Parse the JSON document and return the decoded data
|
10
|
+
# @param [File] data
|
11
|
+
# @return [Hash]
|
12
|
+
def parse(data)
|
13
|
+
::JSON.parse(data.read)
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Langchain
|
4
|
+
module Processors
|
5
|
+
class JSONL < Base
|
6
|
+
EXTENSIONS = [".jsonl"]
|
7
|
+
CONTENT_TYPES = ["application/jsonl", "application/json-lines", "application/jsonlines"]
|
8
|
+
|
9
|
+
# Parse the JSONL document and return one decoded object per line
|
10
|
+
# @param [File] data
|
11
|
+
# @return [Array of Hash]
|
12
|
+
def parse(data)
|
13
|
+
data.read.lines.map do |line|
|
14
|
+
::JSON.parse(line)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
data/lib/langchain.rb
CHANGED
@@ -20,10 +20,13 @@ module Langchain
|
|
20
20
|
|
21
21
|
module Processors
|
22
22
|
autoload :Base, "langchain/processors/base"
|
23
|
-
autoload :
|
23
|
+
autoload :CSV, "langchain/processors/csv"
|
24
|
+
autoload :Docx, "langchain/processors/docx"
|
24
25
|
autoload :HTML, "langchain/processors/html"
|
26
|
+
autoload :JSON, "langchain/processors/json"
|
27
|
+
autoload :JSONL, "langchain/processors/jsonl"
|
28
|
+
autoload :PDF, "langchain/processors/pdf"
|
25
29
|
autoload :Text, "langchain/processors/text"
|
26
|
-
autoload :Docx, "langchain/processors/docx"
|
27
30
|
end
|
28
31
|
end
|
29
32
|
|
@@ -37,6 +40,7 @@ module Vectorsearch
|
|
37
40
|
autoload :Chroma, "vectorsearch/chroma"
|
38
41
|
autoload :Milvus, "vectorsearch/milvus"
|
39
42
|
autoload :Pinecone, "vectorsearch/pinecone"
|
43
|
+
autoload :Pgvector, "vectorsearch/pgvector"
|
40
44
|
autoload :Qdrant, "vectorsearch/qdrant"
|
41
45
|
autoload :Weaviate, "vectorsearch/weaviate"
|
42
46
|
end
|
@@ -0,0 +1,100 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Vectorsearch
|
4
|
+
# The PostgreSQL vector search adapter
|
5
|
+
class Pgvector < Base
|
6
|
+
# @param url [String] The URL of the PostgreSQL database
|
7
|
+
# @param index_name [String] The name of the table to use for the index
|
8
|
+
# @param llm [String] The URL of the Language Layer API
|
9
|
+
# @param llm_api_key [String] The API key for the Language Layer API
|
10
|
+
# @param api_key [String] The API key for the Vectorsearch DB (not used for PostgreSQL)
|
11
|
+
def initialize(url:, index_name:, llm:, llm_api_key:, api_key: nil)
|
12
|
+
require "pg"
|
13
|
+
require "pgvector"
|
14
|
+
|
15
|
+
@client = ::PG.connect(url)
|
16
|
+
registry = ::PG::BasicTypeRegistry.new.define_default_types
|
17
|
+
::Pgvector::PG.register_vector(registry)
|
18
|
+
@client.type_map_for_results = PG::BasicTypeMapForResults.new(@client, registry: registry)
|
19
|
+
|
20
|
+
@index_name = index_name
|
21
|
+
|
22
|
+
super(llm: llm, llm_api_key: llm_api_key)
|
23
|
+
end
|
24
|
+
|
25
|
+
# Add a list of texts to the index
|
26
|
+
# @param texts [Array<String>] The texts to add to the index
|
27
|
+
# @return [PG::Result] The response from the database
|
28
|
+
def add_texts(texts:)
|
29
|
+
data = texts.flat_map do |text|
|
30
|
+
[text, llm_client.embed(text: text)]
|
31
|
+
end
|
32
|
+
values = texts.length.times.map { |i| "($#{2 * i + 1}, $#{2 * i + 2})" }.join(",")
|
33
|
+
client.exec_params(
|
34
|
+
"INSERT INTO #{@index_name} (content, vectors) VALUES #{values};",
|
35
|
+
data
|
36
|
+
)
|
37
|
+
end
|
38
|
+
|
39
|
+
# Create default schema
|
40
|
+
# @return [PG::Result] The response from the database
|
41
|
+
def create_default_schema
|
42
|
+
client.exec("CREATE EXTENSION IF NOT EXISTS vector;")
|
43
|
+
client.exec(
|
44
|
+
<<~SQL
|
45
|
+
CREATE TABLE IF NOT EXISTS #{@index_name} (
|
46
|
+
id serial PRIMARY KEY,
|
47
|
+
content TEXT,
|
48
|
+
vectors VECTOR(#{default_dimension})
|
49
|
+
);
|
50
|
+
SQL
|
51
|
+
)
|
52
|
+
end
|
53
|
+
|
54
|
+
# Search for similar texts in the index
|
55
|
+
# @param query [String] The text to search for
|
56
|
+
# @param k [Integer] The number of top results to return
|
57
|
+
# @return [Array<Hash>] The results of the search
|
58
|
+
def similarity_search(query:, k: 4)
|
59
|
+
embedding = llm_client.embed(text: query)
|
60
|
+
|
61
|
+
similarity_search_by_vector(
|
62
|
+
embedding: embedding,
|
63
|
+
k: k
|
64
|
+
)
|
65
|
+
end
|
66
|
+
|
67
|
+
# Search for similar texts in the index by the passed in vector.
|
68
|
+
# You must generate your own vector using the same LLM that generated the embeddings stored in the Vectorsearch DB.
|
69
|
+
# @param embedding [Array<Float>] The vector to search for
|
70
|
+
# @param k [Integer] The number of top results to return
|
71
|
+
# @return [Array<Hash>] The results of the search
|
72
|
+
def similarity_search_by_vector(embedding:, k: 4)
|
73
|
+
result = client.transaction do |conn|
|
74
|
+
conn.exec("SET LOCAL ivfflat.probes = 10;")
|
75
|
+
query = <<~SQL
|
76
|
+
SELECT id, content FROM #{@index_name} ORDER BY vectors <-> $1 ASC LIMIT $2;
|
77
|
+
SQL
|
78
|
+
conn.exec_params(query, [embedding, k])
|
79
|
+
end
|
80
|
+
|
81
|
+
result.to_a
|
82
|
+
end
|
83
|
+
|
84
|
+
# Ask a question and return the answer
|
85
|
+
# @param question [String] The question to ask
|
86
|
+
# @return [String] The answer to the question
|
87
|
+
def ask(question:)
|
88
|
+
search_results = similarity_search(query: question)
|
89
|
+
|
90
|
+
context = search_results.map do |result|
|
91
|
+
result["content"].to_s
|
92
|
+
end
|
93
|
+
context = context.join("\n---\n")
|
94
|
+
|
95
|
+
prompt = generate_prompt(question: question, context: context)
|
96
|
+
|
97
|
+
llm_client.chat(prompt: prompt)
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: langchainrb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrei Bondarev
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-05-
|
11
|
+
date: 2023-05-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: dotenv-rails
|
@@ -164,6 +164,34 @@ dependencies:
|
|
164
164
|
- - "~>"
|
165
165
|
- !ruby/object:Gem::Version
|
166
166
|
version: '1.13'
|
167
|
+
- !ruby/object:Gem::Dependency
|
168
|
+
name: pg
|
169
|
+
requirement: !ruby/object:Gem::Requirement
|
170
|
+
requirements:
|
171
|
+
- - "~>"
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: '1.5'
|
174
|
+
type: :development
|
175
|
+
prerelease: false
|
176
|
+
version_requirements: !ruby/object:Gem::Requirement
|
177
|
+
requirements:
|
178
|
+
- - "~>"
|
179
|
+
- !ruby/object:Gem::Version
|
180
|
+
version: '1.5'
|
181
|
+
- !ruby/object:Gem::Dependency
|
182
|
+
name: pgvector
|
183
|
+
requirement: !ruby/object:Gem::Requirement
|
184
|
+
requirements:
|
185
|
+
- - "<"
|
186
|
+
- !ruby/object:Gem::Version
|
187
|
+
version: '0.2'
|
188
|
+
type: :development
|
189
|
+
prerelease: false
|
190
|
+
version_requirements: !ruby/object:Gem::Requirement
|
191
|
+
requirements:
|
192
|
+
- - "<"
|
193
|
+
- !ruby/object:Gem::Version
|
194
|
+
version: '0.2'
|
167
195
|
- !ruby/object:Gem::Dependency
|
168
196
|
name: pdf-reader
|
169
197
|
requirement: !ruby/object:Gem::Requirement
|
@@ -290,8 +318,11 @@ files:
|
|
290
318
|
- lib/langchain.rb
|
291
319
|
- lib/langchain/loader.rb
|
292
320
|
- lib/langchain/processors/base.rb
|
321
|
+
- lib/langchain/processors/csv.rb
|
293
322
|
- lib/langchain/processors/docx.rb
|
294
323
|
- lib/langchain/processors/html.rb
|
324
|
+
- lib/langchain/processors/json.rb
|
325
|
+
- lib/langchain/processors/jsonl.rb
|
295
326
|
- lib/langchain/processors/pdf.rb
|
296
327
|
- lib/langchain/processors/text.rb
|
297
328
|
- lib/langchainrb.rb
|
@@ -312,6 +343,7 @@ files:
|
|
312
343
|
- lib/vectorsearch/base.rb
|
313
344
|
- lib/vectorsearch/chroma.rb
|
314
345
|
- lib/vectorsearch/milvus.rb
|
346
|
+
- lib/vectorsearch/pgvector.rb
|
315
347
|
- lib/vectorsearch/pinecone.rb
|
316
348
|
- lib/vectorsearch/qdrant.rb
|
317
349
|
- lib/vectorsearch/weaviate.rb
|