langchainrb 0.8.1 → 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/langchain/chunker/markdown.rb +39 -0
- data/lib/langchain/data.rb +4 -3
- data/lib/langchain/llm/response/ollama_response.rb +1 -1
- data/lib/langchain/loader.rb +3 -2
- data/lib/langchain/processors/markdown.rb +17 -0
- data/lib/langchain/utils/token_length/ai21_validator.rb +4 -0
- data/lib/langchain/utils/token_length/base_validator.rb +1 -1
- data/lib/langchain/utils/token_length/cohere_validator.rb +4 -0
- data/lib/langchain/utils/token_length/google_palm_validator.rb +4 -0
- data/lib/langchain/utils/token_length/openai_validator.rb +41 -0
- data/lib/langchain/vectorsearch/base.rb +2 -2
- data/lib/langchain/vectorsearch/epsilla.rb +5 -1
- data/lib/langchain/vectorsearch/pinecone.rb +2 -2
- data/lib/langchain/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 13eec34cc529732ddfb8994956659bd4307a79ebfd76ff883fe3b6644d647c24
|
4
|
+
data.tar.gz: ce04acfe42a6a8da5a5951734651dd0083f7d2efc43cf4b3367710c8221ee96a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2094d99610311a1583d890f8c6898605bcd3e76d2fb72deb1ccd4b250f2b98f7a883401faf2e161b97b82fb29f6e64ead8843d8af22f0bd3e8a4c872c150c134
|
7
|
+
data.tar.gz: d7ce155cbb992e651aa8dc468ed1ee39bd96d1457f50faa11a32d7caac87086f5d8a381fc2b50aaba10ac934486ed415d5e609f47ee0426b4187540e2436b2e9
|
data/CHANGELOG.md
CHANGED
data/lib/langchain/chunker/markdown.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "baran"
|
4
|
+
|
5
|
+
module Langchain
|
6
|
+
module Chunker
|
7
|
+
#
|
8
|
+
# Simple text chunker
|
9
|
+
#
|
10
|
+
# Usage:
|
11
|
+
# Langchain::Chunker::Markdown.new(text).chunks
|
12
|
+
#
|
13
|
+
class Markdown < Base
|
14
|
+
attr_reader :text, :chunk_size, :chunk_overlap
|
15
|
+
|
16
|
+
# @param [String] text
|
17
|
+
# @param [Integer] chunk_size
|
18
|
+
# @param [Integer] chunk_overlap
|
19
|
+
# @param [String] separator
|
20
|
+
def initialize(text, chunk_size: 1000, chunk_overlap: 200)
|
21
|
+
@text = text
|
22
|
+
@chunk_size = chunk_size
|
23
|
+
@chunk_overlap = chunk_overlap
|
24
|
+
end
|
25
|
+
|
26
|
+
# @return [Array<Langchain::Chunk>]
|
27
|
+
def chunks
|
28
|
+
splitter = Baran::MarkdownSplitter.new(
|
29
|
+
chunk_size: chunk_size,
|
30
|
+
chunk_overlap: chunk_overlap
|
31
|
+
)
|
32
|
+
|
33
|
+
splitter.chunks(text).map do |chunk|
|
34
|
+
Langchain::Chunk.new(text: chunk[:text])
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
data/lib/langchain/data.rb
CHANGED
@@ -9,9 +9,10 @@ module Langchain
|
|
9
9
|
|
10
10
|
# @param data [String] data that was loaded
|
11
11
|
# @option options [String] :source URL or Path of the data source
|
12
|
-
def initialize(data,
|
13
|
-
@source =
|
12
|
+
def initialize(data, source: nil, chunker: Langchain::Chunker::Text)
|
13
|
+
@source = source
|
14
14
|
@data = data
|
15
|
+
@chunker_klass = chunker
|
15
16
|
end
|
16
17
|
|
17
18
|
# @return [String]
|
@@ -22,7 +23,7 @@ module Langchain
|
|
22
23
|
# @param opts [Hash] options passed to the chunker
|
23
24
|
# @return [Array<String>]
|
24
25
|
def chunks(opts = {})
|
25
|
-
|
26
|
+
@chunker_klass.new(@data, **opts).chunks
|
26
27
|
end
|
27
28
|
end
|
28
29
|
end
|
data/lib/langchain/loader.rb
CHANGED
@@ -37,9 +37,10 @@ module Langchain
|
|
37
37
|
# @param path [String | Pathname] path to file or URL
|
38
38
|
# @param options [Hash] options passed to the processor class used to process the data
|
39
39
|
# @return [Langchain::Loader] loader instance
|
40
|
-
def initialize(path, options = {})
|
40
|
+
def initialize(path, options = {}, chunker: Langchain::Chunker::Text)
|
41
41
|
@options = options
|
42
42
|
@path = path
|
43
|
+
@chunker = chunker
|
43
44
|
end
|
44
45
|
|
45
46
|
# Is the path a URL?
|
@@ -112,7 +113,7 @@ module Langchain
|
|
112
113
|
processor_klass.new(@options).parse(@raw_data)
|
113
114
|
end
|
114
115
|
|
115
|
-
Langchain::Data.new(result)
|
116
|
+
Langchain::Data.new(result, source: @options[:source], chunker: @chunker)
|
116
117
|
end
|
117
118
|
|
118
119
|
def processor_klass
|
data/lib/langchain/processors/markdown.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Langchain
|
4
|
+
module Processors
|
5
|
+
class Markdown < Base
|
6
|
+
EXTENSIONS = [".markdown", ".md"]
|
7
|
+
CONTENT_TYPES = ["text/markdown"]
|
8
|
+
|
9
|
+
# Parse the document and return the text
|
10
|
+
# @param [File] data
|
11
|
+
# @return [String]
|
12
|
+
def parse(data)
|
13
|
+
data.read
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
data/lib/langchain/utils/token_length/ai21_validator.rb
CHANGED
@@ -31,6 +31,10 @@ module Langchain
|
|
31
31
|
TOKEN_LIMITS[model_name]
|
32
32
|
end
|
33
33
|
singleton_class.alias_method :completion_token_limit, :token_limit
|
34
|
+
|
35
|
+
def self.token_length_from_messages(messages, model_name, options)
|
36
|
+
messages.sum { |message| token_length(message.to_json, model_name, options) }
|
37
|
+
end
|
34
38
|
end
|
35
39
|
end
|
36
40
|
end
|
data/lib/langchain/utils/token_length/base_validator.rb
CHANGED
@@ -14,7 +14,7 @@ module Langchain
|
|
14
14
|
class BaseValidator
|
15
15
|
def self.validate_max_tokens!(content, model_name, options = {})
|
16
16
|
text_token_length = if content.is_a?(Array)
|
17
|
-
content
|
17
|
+
token_length_from_messages(content, model_name, options)
|
18
18
|
else
|
19
19
|
token_length(content, model_name, options)
|
20
20
|
end
|
data/lib/langchain/utils/token_length/cohere_validator.rb
CHANGED
@@ -39,6 +39,10 @@ module Langchain
|
|
39
39
|
TOKEN_LIMITS[model_name]
|
40
40
|
end
|
41
41
|
singleton_class.alias_method :completion_token_limit, :token_limit
|
42
|
+
|
43
|
+
def self.token_length_from_messages(messages, model_name, options)
|
44
|
+
messages.sum { |message| token_length(message.to_json, model_name, options) }
|
45
|
+
end
|
42
46
|
end
|
43
47
|
end
|
44
48
|
end
|
data/lib/langchain/utils/token_length/google_palm_validator.rb
CHANGED
@@ -43,6 +43,10 @@ module Langchain
|
|
43
43
|
response.dig("tokenCount")
|
44
44
|
end
|
45
45
|
|
46
|
+
def self.token_length_from_messages(messages, model_name, options)
|
47
|
+
messages.sum { |message| token_length(message.to_json, model_name, options) }
|
48
|
+
end
|
49
|
+
|
46
50
|
def self.token_limit(model_name)
|
47
51
|
TOKEN_LIMITS.dig(model_name, "input_token_limit")
|
48
52
|
end
|
data/lib/langchain/utils/token_length/openai_validator.rb
CHANGED
@@ -75,6 +75,47 @@ module Langchain
|
|
75
75
|
max_tokens = super(content, model_name, options)
|
76
76
|
[options[:max_tokens], max_tokens].reject(&:nil?).min
|
77
77
|
end
|
78
|
+
|
79
|
+
# Copied from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
|
80
|
+
# Return the number of tokens used by a list of messages
|
81
|
+
#
|
82
|
+
# @param messages [Array<Hash>] The messages to calculate the token length for
|
83
|
+
# @param model [String] The model name to validate against
|
84
|
+
# @return [Integer] The token length of the messages
|
85
|
+
#
|
86
|
+
def self.token_length_from_messages(messages, model_name, options = {})
|
87
|
+
encoding = Tiktoken.encoding_for_model(model_name)
|
88
|
+
|
89
|
+
if ["gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "gpt-4-0314", "gpt-4-32k-0314", "gpt-4-0613", "gpt-4-32k-0613"].include?(model_name)
|
90
|
+
tokens_per_message = 3
|
91
|
+
tokens_per_name = 1
|
92
|
+
elsif model_name == "gpt-3.5-turbo-0301"
|
93
|
+
tokens_per_message = 4 # every message follows {role/name}\n{content}\n
|
94
|
+
tokens_per_name = -1 # if there's a name, the role is omitted
|
95
|
+
elsif model_name.include?("gpt-3.5-turbo")
|
96
|
+
puts "Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613."
|
97
|
+
return token_length_from_messages(messages, "gpt-3.5-turbo-0613", options)
|
98
|
+
elsif model_name.include?("gpt-4")
|
99
|
+
puts "Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613."
|
100
|
+
return token_length_from_messages(messages, "gpt-4-0613", options)
|
101
|
+
else
|
102
|
+
raise NotImplementedError.new(
|
103
|
+
"token_length_from_messages() is not implemented for model #{model_name}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."
|
104
|
+
)
|
105
|
+
end
|
106
|
+
|
107
|
+
num_tokens = 0
|
108
|
+
messages.each do |message|
|
109
|
+
num_tokens += tokens_per_message
|
110
|
+
message.each do |key, value|
|
111
|
+
num_tokens += encoding.encode(value).length
|
112
|
+
num_tokens += tokens_per_name if ["name", :name].include?(key)
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
num_tokens += 3 # every reply is primed with assistant
|
117
|
+
num_tokens
|
118
|
+
end
|
78
119
|
end
|
79
120
|
end
|
80
121
|
end
|
data/lib/langchain/vectorsearch/base.rb
CHANGED
@@ -175,13 +175,13 @@ module Langchain::Vectorsearch
|
|
175
175
|
prompt_template.format(question: question, context: context)
|
176
176
|
end
|
177
177
|
|
178
|
-
def add_data(paths:)
|
178
|
+
def add_data(paths:, options: {}, chunker: Langchain::Chunker::Text)
|
179
179
|
raise ArgumentError, "Paths must be provided" if Array(paths).empty?
|
180
180
|
|
181
181
|
texts = Array(paths)
|
182
182
|
.flatten
|
183
183
|
.map do |path|
|
184
|
-
data = Langchain::Loader.new(path)&.load&.chunks
|
184
|
+
data = Langchain::Loader.new(path, options, chunker: chunker)&.load&.chunks
|
185
185
|
data.map { |chunk| chunk.text }
|
186
186
|
end
|
187
187
|
|
data/lib/langchain/vectorsearch/epsilla.rb
CHANGED
@@ -36,7 +36,11 @@ module Langchain::Vectorsearch
|
|
36
36
|
status_code, response = @client.database.load_db(db_name, db_path)
|
37
37
|
|
38
38
|
if status_code != 200
|
39
|
-
if status_code == 500 && response["message"].include?("already loaded")
|
39
|
+
if status_code == 409 || (status_code == 500 && response["message"].include?("already loaded"))
|
40
|
+
# When db is already loaded, Epsilla may return HTTP 409 Conflict.
|
41
|
+
# This behavior is changed in https://github.com/epsilla-cloud/vectordb/pull/95
|
42
|
+
# Old behavior (HTTP 500) is preserved for backwards compatibility.
|
43
|
+
# It does not prevent us from using the db.
|
40
44
|
Langchain.logger.info("Database already loaded")
|
41
45
|
else
|
42
46
|
raise "Failed to load database: #{response}"
|
data/lib/langchain/vectorsearch/pinecone.rb
CHANGED
@@ -64,13 +64,13 @@ module Langchain::Vectorsearch
|
|
64
64
|
index.upsert(vectors: vectors, namespace: namespace)
|
65
65
|
end
|
66
66
|
|
67
|
-
def add_data(paths:, namespace: "")
|
67
|
+
def add_data(paths:, namespace: "", options: {}, chunker: Langchain::Chunker::Text)
|
68
68
|
raise ArgumentError, "Paths must be provided" if Array(paths).empty?
|
69
69
|
|
70
70
|
texts = Array(paths)
|
71
71
|
.flatten
|
72
72
|
.map do |path|
|
73
|
-
data = Langchain::Loader.new(path)&.load&.chunks
|
73
|
+
data = Langchain::Loader.new(path, options, chunker: chunker)&.load&.chunks
|
74
74
|
data.map { |chunk| chunk.text }
|
75
75
|
end
|
76
76
|
|
data/lib/langchain/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: langchainrb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.1
|
4
|
+
version: 0.8.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrei Bondarev
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-12-
|
11
|
+
date: 2023-12-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: baran
|
@@ -618,6 +618,7 @@ files:
|
|
618
618
|
- lib/langchain/agent/sql_query_agent/sql_query_agent_sql_prompt.yaml
|
619
619
|
- lib/langchain/chunk.rb
|
620
620
|
- lib/langchain/chunker/base.rb
|
621
|
+
- lib/langchain/chunker/markdown.rb
|
621
622
|
- lib/langchain/chunker/prompts/semantic_prompt_template.yml
|
622
623
|
- lib/langchain/chunker/recursive_text.rb
|
623
624
|
- lib/langchain/chunker/semantic.rb
|
@@ -677,6 +678,7 @@ files:
|
|
677
678
|
- lib/langchain/processors/html.rb
|
678
679
|
- lib/langchain/processors/json.rb
|
679
680
|
- lib/langchain/processors/jsonl.rb
|
681
|
+
- lib/langchain/processors/markdown.rb
|
680
682
|
- lib/langchain/processors/pdf.rb
|
681
683
|
- lib/langchain/processors/text.rb
|
682
684
|
- lib/langchain/processors/xlsx.rb
|