langchainrb 0.8.1 → 0.8.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/langchain/chunker/markdown.rb +39 -0
- data/lib/langchain/data.rb +4 -3
- data/lib/langchain/llm/response/ollama_response.rb +1 -1
- data/lib/langchain/loader.rb +3 -2
- data/lib/langchain/processors/markdown.rb +17 -0
- data/lib/langchain/utils/token_length/ai21_validator.rb +4 -0
- data/lib/langchain/utils/token_length/base_validator.rb +1 -1
- data/lib/langchain/utils/token_length/cohere_validator.rb +4 -0
- data/lib/langchain/utils/token_length/google_palm_validator.rb +4 -0
- data/lib/langchain/utils/token_length/openai_validator.rb +41 -0
- data/lib/langchain/vectorsearch/base.rb +2 -2
- data/lib/langchain/vectorsearch/epsilla.rb +5 -1
- data/lib/langchain/vectorsearch/pinecone.rb +2 -2
- data/lib/langchain/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 13eec34cc529732ddfb8994956659bd4307a79ebfd76ff883fe3b6644d647c24
|
4
|
+
data.tar.gz: ce04acfe42a6a8da5a5951734651dd0083f7d2efc43cf4b3367710c8221ee96a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2094d99610311a1583d890f8c6898605bcd3e76d2fb72deb1ccd4b250f2b98f7a883401faf2e161b97b82fb29f6e64ead8843d8af22f0bd3e8a4c872c150c134
|
7
|
+
data.tar.gz: d7ce155cbb992e651aa8dc468ed1ee39bd96d1457f50faa11a32d7caac87086f5d8a381fc2b50aaba10ac934486ed415d5e609f47ee0426b4187540e2436b2e9
|
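data/CHANGELOG.md
CHANGED
(hunk for data/CHANGELOG.md, +4 -0, was not captured in this extract)
data/lib/langchain/chunker/markdown.rb
ADDED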
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "baran"
|
4
|
+
|
5
|
+
module Langchain
|
6
|
+
module Chunker
|
7
|
+
#
|
8
|
+
# Simple text chunker
|
9
|
+
#
|
10
|
+
# Usage:
|
11
|
+
# Langchain::Chunker::Markdown.new(text).chunks
|
12
|
+
#
|
13
|
+
class Markdown < Base
|
14
|
+
attr_reader :text, :chunk_size, :chunk_overlap
|
15
|
+
|
16
|
+
# @param [String] text
|
17
|
+
# @param [Integer] chunk_size
|
18
|
+
# @param [Integer] chunk_overlap
|
19
|
+
# @param [String] separator
|
20
|
+
def initialize(text, chunk_size: 1000, chunk_overlap: 200)
|
21
|
+
@text = text
|
22
|
+
@chunk_size = chunk_size
|
23
|
+
@chunk_overlap = chunk_overlap
|
24
|
+
end
|
25
|
+
|
26
|
+
# @return [Array<Langchain::Chunk>]
|
27
|
+
def chunks
|
28
|
+
splitter = Baran::MarkdownSplitter.new(
|
29
|
+
chunk_size: chunk_size,
|
30
|
+
chunk_overlap: chunk_overlap
|
31
|
+
)
|
32
|
+
|
33
|
+
splitter.chunks(text).map do |chunk|
|
34
|
+
Langchain::Chunk.new(text: chunk[:text])
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
data/lib/langchain/data.rb
CHANGED
@@ -9,9 +9,10 @@ module Langchain
|
|
9
9
|
|
10
10
|
# @param data [String] data that was loaded
|
11
11
|
# @option options [String] :source URL or Path of the data source
|
12
|
-
def initialize(data,
|
13
|
-
@source =
|
12
|
+
def initialize(data, source: nil, chunker: Langchain::Chunker::Text)
|
13
|
+
@source = source
|
14
14
|
@data = data
|
15
|
+
@chunker_klass = chunker
|
15
16
|
end
|
16
17
|
|
17
18
|
# @return [String]
|
@@ -22,7 +23,7 @@ module Langchain
|
|
22
23
|
# @param opts [Hash] options passed to the chunker
|
23
24
|
# @return [Array<String>]
|
24
25
|
def chunks(opts = {})
|
25
|
-
|
26
|
+
@chunker_klass.new(@data, **opts).chunks
|
26
27
|
end
|
27
28
|
end
|
28
29
|
end
|
data/lib/langchain/loader.rb
CHANGED
@@ -37,9 +37,10 @@ module Langchain
|
|
37
37
|
# @param path [String | Pathname] path to file or URL
|
38
38
|
# @param options [Hash] options passed to the processor class used to process the data
|
39
39
|
# @return [Langchain::Loader] loader instance
|
40
|
-
def initialize(path, options = {})
|
40
|
+
def initialize(path, options = {}, chunker: Langchain::Chunker::Text)
|
41
41
|
@options = options
|
42
42
|
@path = path
|
43
|
+
@chunker = chunker
|
43
44
|
end
|
44
45
|
|
45
46
|
# Is the path a URL?
|
@@ -112,7 +113,7 @@ module Langchain
|
|
112
113
|
processor_klass.new(@options).parse(@raw_data)
|
113
114
|
end
|
114
115
|
|
115
|
-
Langchain::Data.new(result)
|
116
|
+
Langchain::Data.new(result, source: @options[:source], chunker: @chunker)
|
116
117
|
end
|
117
118
|
|
118
119
|
def processor_klass
|
data/lib/langchain/processors/markdown.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Langchain
|
4
|
+
module Processors
|
5
|
+
class Markdown < Base
|
6
|
+
EXTENSIONS = [".markdown", ".md"]
|
7
|
+
CONTENT_TYPES = ["text/markdown"]
|
8
|
+
|
9
|
+
# Parse the document and return the text
|
10
|
+
# @param [File] data
|
11
|
+
# @return [String]
|
12
|
+
def parse(data)
|
13
|
+
data.read
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
data/lib/langchain/utils/token_length/ai21_validator.rb
CHANGED
@@ -31,6 +31,10 @@ module Langchain
|
|
31
31
|
TOKEN_LIMITS[model_name]
|
32
32
|
end
|
33
33
|
singleton_class.alias_method :completion_token_limit, :token_limit
|
34
|
+
|
35
|
+
def self.token_length_from_messages(messages, model_name, options)
|
36
|
+
messages.sum { |message| token_length(message.to_json, model_name, options) }
|
37
|
+
end
|
34
38
|
end
|
35
39
|
end
|
36
40
|
end
|
data/lib/langchain/utils/token_length/base_validator.rb
CHANGED
@@ -14,7 +14,7 @@ module Langchain
|
|
14
14
|
class BaseValidator
|
15
15
|
def self.validate_max_tokens!(content, model_name, options = {})
|
16
16
|
text_token_length = if content.is_a?(Array)
|
17
|
-
content
|
17
|
+
token_length_from_messages(content, model_name, options)
|
18
18
|
else
|
19
19
|
token_length(content, model_name, options)
|
20
20
|
end
|
data/lib/langchain/utils/token_length/cohere_validator.rb
CHANGED
@@ -39,6 +39,10 @@ module Langchain
|
|
39
39
|
TOKEN_LIMITS[model_name]
|
40
40
|
end
|
41
41
|
singleton_class.alias_method :completion_token_limit, :token_limit
|
42
|
+
|
43
|
+
def self.token_length_from_messages(messages, model_name, options)
|
44
|
+
messages.sum { |message| token_length(message.to_json, model_name, options) }
|
45
|
+
end
|
42
46
|
end
|
43
47
|
end
|
44
48
|
end
|
data/lib/langchain/utils/token_length/google_palm_validator.rb
CHANGED
@@ -43,6 +43,10 @@ module Langchain
|
|
43
43
|
response.dig("tokenCount")
|
44
44
|
end
|
45
45
|
|
46
|
+
def self.token_length_from_messages(messages, model_name, options)
|
47
|
+
messages.sum { |message| token_length(message.to_json, model_name, options) }
|
48
|
+
end
|
49
|
+
|
46
50
|
def self.token_limit(model_name)
|
47
51
|
TOKEN_LIMITS.dig(model_name, "input_token_limit")
|
48
52
|
end
|
data/lib/langchain/utils/token_length/openai_validator.rb
CHANGED
@@ -75,6 +75,47 @@ module Langchain
|
|
75
75
|
max_tokens = super(content, model_name, options)
|
76
76
|
[options[:max_tokens], max_tokens].reject(&:nil?).min
|
77
77
|
end
|
78
|
+
|
79
|
+
# Copied from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
|
80
|
+
# Return the number of tokens used by a list of messages
|
81
|
+
#
|
82
|
+
# @param messages [Array<Hash>] The messages to calculate the token length for
|
83
|
+
# @param model [String] The model name to validate against
|
84
|
+
# @return [Integer] The token length of the messages
|
85
|
+
#
|
86
|
+
def self.token_length_from_messages(messages, model_name, options = {})
|
87
|
+
encoding = Tiktoken.encoding_for_model(model_name)
|
88
|
+
|
89
|
+
if ["gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "gpt-4-0314", "gpt-4-32k-0314", "gpt-4-0613", "gpt-4-32k-0613"].include?(model_name)
|
90
|
+
tokens_per_message = 3
|
91
|
+
tokens_per_name = 1
|
92
|
+
elsif model_name == "gpt-3.5-turbo-0301"
|
93
|
+
tokens_per_message = 4 # every message follows {role/name}\n{content}\n
|
94
|
+
tokens_per_name = -1 # if there's a name, the role is omitted
|
95
|
+
elsif model_name.include?("gpt-3.5-turbo")
|
96
|
+
puts "Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613."
|
97
|
+
return token_length_from_messages(messages, "gpt-3.5-turbo-0613", options)
|
98
|
+
elsif model_name.include?("gpt-4")
|
99
|
+
puts "Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613."
|
100
|
+
return token_length_from_messages(messages, "gpt-4-0613", options)
|
101
|
+
else
|
102
|
+
raise NotImplementedError.new(
|
103
|
+
"token_length_from_messages() is not implemented for model #{model_name}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."
|
104
|
+
)
|
105
|
+
end
|
106
|
+
|
107
|
+
num_tokens = 0
|
108
|
+
messages.each do |message|
|
109
|
+
num_tokens += tokens_per_message
|
110
|
+
message.each do |key, value|
|
111
|
+
num_tokens += encoding.encode(value).length
|
112
|
+
num_tokens += tokens_per_name if ["name", :name].include?(key)
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
num_tokens += 3 # every reply is primed with assistant
|
117
|
+
num_tokens
|
118
|
+
end
|
78
119
|
end
|
79
120
|
end
|
80
121
|
end
|
data/lib/langchain/vectorsearch/base.rb
CHANGED
@@ -175,13 +175,13 @@ module Langchain::Vectorsearch
|
|
175
175
|
prompt_template.format(question: question, context: context)
|
176
176
|
end
|
177
177
|
|
178
|
-
def add_data(paths:)
|
178
|
+
def add_data(paths:, options: {}, chunker: Langchain::Chunker::Text)
|
179
179
|
raise ArgumentError, "Paths must be provided" if Array(paths).empty?
|
180
180
|
|
181
181
|
texts = Array(paths)
|
182
182
|
.flatten
|
183
183
|
.map do |path|
|
184
|
-
data = Langchain::Loader.new(path)&.load&.chunks
|
184
|
+
data = Langchain::Loader.new(path, options, chunker: chunker)&.load&.chunks
|
185
185
|
data.map { |chunk| chunk.text }
|
186
186
|
end
|
187
187
|
|
data/lib/langchain/vectorsearch/epsilla.rb
CHANGED
@@ -36,7 +36,11 @@ module Langchain::Vectorsearch
|
|
36
36
|
status_code, response = @client.database.load_db(db_name, db_path)
|
37
37
|
|
38
38
|
if status_code != 200
|
39
|
-
if status_code == 500 && response["message"].include?("already loaded")
|
39
|
+
if status_code == 409 || (status_code == 500 && response["message"].include?("already loaded"))
|
40
|
+
# When db is already loaded, Epsilla may return HTTP 409 Conflict.
|
41
|
+
# This behavior is changed in https://github.com/epsilla-cloud/vectordb/pull/95
|
42
|
+
# Old behavior (HTTP 500) is preserved for backwards compatibility.
|
43
|
+
# It does not prevent us from using the db.
|
40
44
|
Langchain.logger.info("Database already loaded")
|
41
45
|
else
|
42
46
|
raise "Failed to load database: #{response}"
|
data/lib/langchain/vectorsearch/pinecone.rb
CHANGED
@@ -64,13 +64,13 @@ module Langchain::Vectorsearch
|
|
64
64
|
index.upsert(vectors: vectors, namespace: namespace)
|
65
65
|
end
|
66
66
|
|
67
|
-
def add_data(paths:, namespace: "")
|
67
|
+
def add_data(paths:, namespace: "", options: {}, chunker: Langchain::Chunker::Text)
|
68
68
|
raise ArgumentError, "Paths must be provided" if Array(paths).empty?
|
69
69
|
|
70
70
|
texts = Array(paths)
|
71
71
|
.flatten
|
72
72
|
.map do |path|
|
73
|
-
data = Langchain::Loader.new(path)&.load&.chunks
|
73
|
+
data = Langchain::Loader.new(path, options, chunker: chunker)&.load&.chunks
|
74
74
|
data.map { |chunk| chunk.text }
|
75
75
|
end
|
76
76
|
|
data/lib/langchain/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: langchainrb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.1
|
4
|
+
version: 0.8.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrei Bondarev
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-12-
|
11
|
+
date: 2023-12-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: baran
|
@@ -618,6 +618,7 @@ files:
|
|
618
618
|
- lib/langchain/agent/sql_query_agent/sql_query_agent_sql_prompt.yaml
|
619
619
|
- lib/langchain/chunk.rb
|
620
620
|
- lib/langchain/chunker/base.rb
|
621
|
+
- lib/langchain/chunker/markdown.rb
|
621
622
|
- lib/langchain/chunker/prompts/semantic_prompt_template.yml
|
622
623
|
- lib/langchain/chunker/recursive_text.rb
|
623
624
|
- lib/langchain/chunker/semantic.rb
|
@@ -677,6 +678,7 @@ files:
|
|
677
678
|
- lib/langchain/processors/html.rb
|
678
679
|
- lib/langchain/processors/json.rb
|
679
680
|
- lib/langchain/processors/jsonl.rb
|
681
|
+
- lib/langchain/processors/markdown.rb
|
680
682
|
- lib/langchain/processors/pdf.rb
|
681
683
|
- lib/langchain/processors/text.rb
|
682
684
|
- lib/langchain/processors/xlsx.rb
|