langchainrb 0.8.1 → 0.8.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5dd13c5aae47af13fe248636ed88bd40d0e241291ab5c3dc2d5925dcc742af37
4
- data.tar.gz: b190f73403a77b4ea4d1f9869423546d584df32785ae342a01d9a72ee5fe04fd
3
+ metadata.gz: 13eec34cc529732ddfb8994956659bd4307a79ebfd76ff883fe3b6644d647c24
4
+ data.tar.gz: ce04acfe42a6a8da5a5951734651dd0083f7d2efc43cf4b3367710c8221ee96a
5
5
  SHA512:
6
- metadata.gz: 81dd80f49173e3d711a713b6dd365addf04129cb0f6c015d6909200a709780e30c39888f0bccba72035e03c17a0b01a4d1456e6431473149d9969907435f18c1
7
- data.tar.gz: 748f841cf01b802e81bc6f6ecf8aaea5ab13593363afadc7c9634446c169812064dd41af3e58e87068a224972be85f00b1e3c2669a99e1406819507c86b1a15c
6
+ metadata.gz: 2094d99610311a1583d890f8c6898605bcd3e76d2fb72deb1ccd4b250f2b98f7a883401faf2e161b97b82fb29f6e64ead8843d8af22f0bd3e8a4c872c150c134
7
+ data.tar.gz: d7ce155cbb992e651aa8dc468ed1ee39bd96d1457f50faa11a32d7caac87086f5d8a381fc2b50aaba10ac934486ed415d5e609f47ee0426b4187540e2436b2e9
data/CHANGELOG.md CHANGED
@@ -1,5 +1,9 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ## [0.8.2]
4
+ - Introducing new `Langchain::Chunker::Markdown` chunker (thanks @spikex)
5
+ - Fixes
6
+
3
7
  ## [0.8.1]
4
8
  - Support for Epsilla vector DB
5
9
  - Fully functioning Google Vertex AI LLM
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "baran"
4
+
5
+ module Langchain
6
+ module Chunker
7
+ #
8
+ # Markdown chunker, splitting on Markdown structure via Baran::MarkdownSplitter
9
+ #
10
+ # Usage:
11
+ # Langchain::Chunker::Markdown.new(text).chunks
12
+ #
13
+ class Markdown < Base
14
+ attr_reader :text, :chunk_size, :chunk_overlap
15
+
16
+ # @param [String] text
17
+ # @param [Integer] chunk_size
18
+ # @param [Integer] chunk_overlap
19
+ # (no separator parameter — splitting boundaries are handled by Baran::MarkdownSplitter)
20
+ def initialize(text, chunk_size: 1000, chunk_overlap: 200)
21
+ @text = text
22
+ @chunk_size = chunk_size
23
+ @chunk_overlap = chunk_overlap
24
+ end
25
+
26
+ # @return [Array<Langchain::Chunk>]
27
+ def chunks
28
+ splitter = Baran::MarkdownSplitter.new(
29
+ chunk_size: chunk_size,
30
+ chunk_overlap: chunk_overlap
31
+ )
32
+
33
+ splitter.chunks(text).map do |chunk|
34
+ Langchain::Chunk.new(text: chunk[:text])
35
+ end
36
+ end
37
+ end
38
+ end
39
+ end
@@ -9,9 +9,10 @@ module Langchain
9
9
 
10
10
  # @param data [String] data that was loaded
11
11
  # @option options [String] :source URL or Path of the data source
12
- def initialize(data, options = {})
13
- @source = options[:source]
12
+ def initialize(data, source: nil, chunker: Langchain::Chunker::Text)
13
+ @source = source
14
14
  @data = data
15
+ @chunker_klass = chunker
15
16
  end
16
17
 
17
18
  # @return [String]
@@ -22,7 +23,7 @@ module Langchain
22
23
  # @param opts [Hash] options passed to the chunker
23
24
  # @return [Array<String>]
24
25
  def chunks(opts = {})
25
- Langchain::Chunker::Text.new(@data, **opts).chunks
26
+ @chunker_klass.new(@data, **opts).chunks
26
27
  end
27
28
  end
28
29
  end
@@ -8,7 +8,7 @@ module Langchain::LLM
8
8
  end
9
9
 
10
10
  def completion
11
- raw_response.first
11
+ completions.first
12
12
  end
13
13
 
14
14
  def completions
@@ -37,9 +37,10 @@ module Langchain
37
37
  # @param path [String | Pathname] path to file or URL
38
38
  # @param options [Hash] options passed to the processor class used to process the data
39
39
  # @return [Langchain::Loader] loader instance
40
- def initialize(path, options = {})
40
+ def initialize(path, options = {}, chunker: Langchain::Chunker::Text)
41
41
  @options = options
42
42
  @path = path
43
+ @chunker = chunker
43
44
  end
44
45
 
45
46
  # Is the path a URL?
@@ -112,7 +113,7 @@ module Langchain
112
113
  processor_klass.new(@options).parse(@raw_data)
113
114
  end
114
115
 
115
- Langchain::Data.new(result)
116
+ Langchain::Data.new(result, source: @options[:source], chunker: @chunker)
116
117
  end
117
118
 
118
119
  def processor_klass
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Langchain
4
+ module Processors
5
+ class Markdown < Base
6
+ EXTENSIONS = [".markdown", ".md"]
7
+ CONTENT_TYPES = ["text/markdown"]
8
+
9
+ # Parse the document and return the text
10
+ # @param [File] data
11
+ # @return [String]
12
+ def parse(data)
13
+ data.read
14
+ end
15
+ end
16
+ end
17
+ end
@@ -31,6 +31,10 @@ module Langchain
31
31
  TOKEN_LIMITS[model_name]
32
32
  end
33
33
  singleton_class.alias_method :completion_token_limit, :token_limit
34
+
35
+ def self.token_length_from_messages(messages, model_name, options)
36
+ messages.sum { |message| token_length(message.to_json, model_name, options) }
37
+ end
34
38
  end
35
39
  end
36
40
  end
@@ -14,7 +14,7 @@ module Langchain
14
14
  class BaseValidator
15
15
  def self.validate_max_tokens!(content, model_name, options = {})
16
16
  text_token_length = if content.is_a?(Array)
17
- content.sum { |item| token_length(item.to_json, model_name, options) }
17
+ token_length_from_messages(content, model_name, options)
18
18
  else
19
19
  token_length(content, model_name, options)
20
20
  end
@@ -39,6 +39,10 @@ module Langchain
39
39
  TOKEN_LIMITS[model_name]
40
40
  end
41
41
  singleton_class.alias_method :completion_token_limit, :token_limit
42
+
43
+ def self.token_length_from_messages(messages, model_name, options)
44
+ messages.sum { |message| token_length(message.to_json, model_name, options) }
45
+ end
42
46
  end
43
47
  end
44
48
  end
@@ -43,6 +43,10 @@ module Langchain
43
43
  response.dig("tokenCount")
44
44
  end
45
45
 
46
+ def self.token_length_from_messages(messages, model_name, options)
47
+ messages.sum { |message| token_length(message.to_json, model_name, options) }
48
+ end
49
+
46
50
  def self.token_limit(model_name)
47
51
  TOKEN_LIMITS.dig(model_name, "input_token_limit")
48
52
  end
@@ -75,6 +75,47 @@ module Langchain
75
75
  max_tokens = super(content, model_name, options)
76
76
  [options[:max_tokens], max_tokens].reject(&:nil?).min
77
77
  end
78
+
79
+ # Copied from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
80
+ # Return the number of tokens used by a list of messages
81
+ #
82
+ # @param messages [Array<Hash>] The messages to calculate the token length for
83
+ # @param model [String] The model name to validate against
84
+ # @return [Integer] The token length of the messages
85
+ #
86
+ def self.token_length_from_messages(messages, model_name, options = {})
87
+ encoding = Tiktoken.encoding_for_model(model_name)
88
+
89
+ if ["gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "gpt-4-0314", "gpt-4-32k-0314", "gpt-4-0613", "gpt-4-32k-0613"].include?(model_name)
90
+ tokens_per_message = 3
91
+ tokens_per_name = 1
92
+ elsif model_name == "gpt-3.5-turbo-0301"
93
+ tokens_per_message = 4 # every message follows {role/name}\n{content}\n
94
+ tokens_per_name = -1 # if there's a name, the role is omitted
95
+ elsif model_name.include?("gpt-3.5-turbo")
96
+ puts "Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613."
97
+ return token_length_from_messages(messages, "gpt-3.5-turbo-0613", options)
98
+ elsif model_name.include?("gpt-4")
99
+ puts "Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613."
100
+ return token_length_from_messages(messages, "gpt-4-0613", options)
101
+ else
102
+ raise NotImplementedError.new(
103
+ "token_length_from_messages() is not implemented for model #{model_name}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."
104
+ )
105
+ end
106
+
107
+ num_tokens = 0
108
+ messages.each do |message|
109
+ num_tokens += tokens_per_message
110
+ message.each do |key, value|
111
+ num_tokens += encoding.encode(value).length
112
+ num_tokens += tokens_per_name if ["name", :name].include?(key)
113
+ end
114
+ end
115
+
116
+ num_tokens += 3 # every reply is primed with assistant
117
+ num_tokens
118
+ end
78
119
  end
79
120
  end
80
121
  end
@@ -175,13 +175,13 @@ module Langchain::Vectorsearch
175
175
  prompt_template.format(question: question, context: context)
176
176
  end
177
177
 
178
- def add_data(paths:)
178
+ def add_data(paths:, options: {}, chunker: Langchain::Chunker::Text)
179
179
  raise ArgumentError, "Paths must be provided" if Array(paths).empty?
180
180
 
181
181
  texts = Array(paths)
182
182
  .flatten
183
183
  .map do |path|
184
- data = Langchain::Loader.new(path)&.load&.chunks
184
+ data = Langchain::Loader.new(path, options, chunker: chunker)&.load&.chunks
185
185
  data.map { |chunk| chunk.text }
186
186
  end
187
187
 
@@ -36,7 +36,11 @@ module Langchain::Vectorsearch
36
36
  status_code, response = @client.database.load_db(db_name, db_path)
37
37
 
38
38
  if status_code != 200
39
- if status_code == 500 && response["message"].include?("already loaded")
39
+ if status_code == 409 || (status_code == 500 && response["message"].include?("already loaded"))
40
+ # When db is already loaded, Epsilla may return HTTP 409 Conflict.
41
+ # This behavior is changed in https://github.com/epsilla-cloud/vectordb/pull/95
42
+ # Old behavior (HTTP 500) is preserved for backwards compatibility.
43
+ # It does not prevent us from using the db.
40
44
  Langchain.logger.info("Database already loaded")
41
45
  else
42
46
  raise "Failed to load database: #{response}"
@@ -64,13 +64,13 @@ module Langchain::Vectorsearch
64
64
  index.upsert(vectors: vectors, namespace: namespace)
65
65
  end
66
66
 
67
- def add_data(paths:, namespace: "")
67
+ def add_data(paths:, namespace: "", options: {}, chunker: Langchain::Chunker::Text)
68
68
  raise ArgumentError, "Paths must be provided" if Array(paths).empty?
69
69
 
70
70
  texts = Array(paths)
71
71
  .flatten
72
72
  .map do |path|
73
- data = Langchain::Loader.new(path)&.load&.chunks
73
+ data = Langchain::Loader.new(path, options, chunker: chunker)&.load&.chunks
74
74
  data.map { |chunk| chunk.text }
75
75
  end
76
76
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Langchain
4
- VERSION = "0.8.1"
4
+ VERSION = "0.8.2"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: langchainrb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.1
4
+ version: 0.8.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrei Bondarev
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-12-07 00:00:00.000000000 Z
11
+ date: 2023-12-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: baran
@@ -618,6 +618,7 @@ files:
618
618
  - lib/langchain/agent/sql_query_agent/sql_query_agent_sql_prompt.yaml
619
619
  - lib/langchain/chunk.rb
620
620
  - lib/langchain/chunker/base.rb
621
+ - lib/langchain/chunker/markdown.rb
621
622
  - lib/langchain/chunker/prompts/semantic_prompt_template.yml
622
623
  - lib/langchain/chunker/recursive_text.rb
623
624
  - lib/langchain/chunker/semantic.rb
@@ -677,6 +678,7 @@ files:
677
678
  - lib/langchain/processors/html.rb
678
679
  - lib/langchain/processors/json.rb
679
680
  - lib/langchain/processors/jsonl.rb
681
+ - lib/langchain/processors/markdown.rb
680
682
  - lib/langchain/processors/pdf.rb
681
683
  - lib/langchain/processors/text.rb
682
684
  - lib/langchain/processors/xlsx.rb