langchainrb 0.8.1 → 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5dd13c5aae47af13fe248636ed88bd40d0e241291ab5c3dc2d5925dcc742af37
4
- data.tar.gz: b190f73403a77b4ea4d1f9869423546d584df32785ae342a01d9a72ee5fe04fd
3
+ metadata.gz: 13eec34cc529732ddfb8994956659bd4307a79ebfd76ff883fe3b6644d647c24
4
+ data.tar.gz: ce04acfe42a6a8da5a5951734651dd0083f7d2efc43cf4b3367710c8221ee96a
5
5
  SHA512:
6
- metadata.gz: 81dd80f49173e3d711a713b6dd365addf04129cb0f6c015d6909200a709780e30c39888f0bccba72035e03c17a0b01a4d1456e6431473149d9969907435f18c1
7
- data.tar.gz: 748f841cf01b802e81bc6f6ecf8aaea5ab13593363afadc7c9634446c169812064dd41af3e58e87068a224972be85f00b1e3c2669a99e1406819507c86b1a15c
6
+ metadata.gz: 2094d99610311a1583d890f8c6898605bcd3e76d2fb72deb1ccd4b250f2b98f7a883401faf2e161b97b82fb29f6e64ead8843d8af22f0bd3e8a4c872c150c134
7
+ data.tar.gz: d7ce155cbb992e651aa8dc468ed1ee39bd96d1457f50faa11a32d7caac87086f5d8a381fc2b50aaba10ac934486ed415d5e609f47ee0426b4187540e2436b2e9
data/CHANGELOG.md CHANGED
@@ -1,5 +1,9 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ## [0.8.2]
4
+ - Introducing new `Langchain::Chunker::Markdown` chunker (thanks @spikex)
5
+ - Fixes
6
+
3
7
  ## [0.8.1]
4
8
  - Support for Epsilla vector DB
5
9
  - Fully functioning Google Vertex AI LLM
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "baran"
4
+
5
+ module Langchain
6
+ module Chunker
7
+ #
8
+ # Markdown chunker
9
+ #
10
+ # Usage:
11
+ # Langchain::Chunker::Markdown.new(text).chunks
12
+ #
13
+ class Markdown < Base
14
+ attr_reader :text, :chunk_size, :chunk_overlap
15
+
16
+ # @param [String] text
17
+ # @param [Integer] chunk_size
18
+ # @param [Integer] chunk_overlap
19
+ # (note: no separator param — Baran::MarkdownSplitter uses its own Markdown-aware separators)
20
+ def initialize(text, chunk_size: 1000, chunk_overlap: 200)
21
+ @text = text
22
+ @chunk_size = chunk_size
23
+ @chunk_overlap = chunk_overlap
24
+ end
25
+
26
+ # @return [Array<Langchain::Chunk>]
27
+ def chunks
28
+ splitter = Baran::MarkdownSplitter.new(
29
+ chunk_size: chunk_size,
30
+ chunk_overlap: chunk_overlap
31
+ )
32
+
33
+ splitter.chunks(text).map do |chunk|
34
+ Langchain::Chunk.new(text: chunk[:text])
35
+ end
36
+ end
37
+ end
38
+ end
39
+ end
@@ -9,9 +9,10 @@ module Langchain
9
9
 
10
10
  # @param data [String] data that was loaded
11
11
  # @option options [String] :source URL or Path of the data source
12
- def initialize(data, options = {})
13
- @source = options[:source]
12
+ def initialize(data, source: nil, chunker: Langchain::Chunker::Text)
13
+ @source = source
14
14
  @data = data
15
+ @chunker_klass = chunker
15
16
  end
16
17
 
17
18
  # @return [String]
@@ -22,7 +23,7 @@ module Langchain
22
23
  # @param opts [Hash] options passed to the chunker
23
24
  # @return [Array<String>]
24
25
  def chunks(opts = {})
25
- Langchain::Chunker::Text.new(@data, **opts).chunks
26
+ @chunker_klass.new(@data, **opts).chunks
26
27
  end
27
28
  end
28
29
  end
@@ -8,7 +8,7 @@ module Langchain::LLM
8
8
  end
9
9
 
10
10
  def completion
11
- raw_response.first
11
+ completions.first
12
12
  end
13
13
 
14
14
  def completions
@@ -37,9 +37,10 @@ module Langchain
37
37
  # @param path [String | Pathname] path to file or URL
38
38
  # @param options [Hash] options passed to the processor class used to process the data
39
39
  # @return [Langchain::Loader] loader instance
40
- def initialize(path, options = {})
40
+ def initialize(path, options = {}, chunker: Langchain::Chunker::Text)
41
41
  @options = options
42
42
  @path = path
43
+ @chunker = chunker
43
44
  end
44
45
 
45
46
  # Is the path a URL?
@@ -112,7 +113,7 @@ module Langchain
112
113
  processor_klass.new(@options).parse(@raw_data)
113
114
  end
114
115
 
115
- Langchain::Data.new(result)
116
+ Langchain::Data.new(result, source: @options[:source], chunker: @chunker)
116
117
  end
117
118
 
118
119
  def processor_klass
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Langchain
4
+ module Processors
5
+ class Markdown < Base
6
+ EXTENSIONS = [".markdown", ".md"]
7
+ CONTENT_TYPES = ["text/markdown"]
8
+
9
+ # Parse the document and return the text
10
+ # @param [File] data
11
+ # @return [String]
12
+ def parse(data)
13
+ data.read
14
+ end
15
+ end
16
+ end
17
+ end
@@ -31,6 +31,10 @@ module Langchain
31
31
  TOKEN_LIMITS[model_name]
32
32
  end
33
33
  singleton_class.alias_method :completion_token_limit, :token_limit
34
+
35
+ def self.token_length_from_messages(messages, model_name, options)
36
+ messages.sum { |message| token_length(message.to_json, model_name, options) }
37
+ end
34
38
  end
35
39
  end
36
40
  end
@@ -14,7 +14,7 @@ module Langchain
14
14
  class BaseValidator
15
15
  def self.validate_max_tokens!(content, model_name, options = {})
16
16
  text_token_length = if content.is_a?(Array)
17
- content.sum { |item| token_length(item.to_json, model_name, options) }
17
+ token_length_from_messages(content, model_name, options)
18
18
  else
19
19
  token_length(content, model_name, options)
20
20
  end
@@ -39,6 +39,10 @@ module Langchain
39
39
  TOKEN_LIMITS[model_name]
40
40
  end
41
41
  singleton_class.alias_method :completion_token_limit, :token_limit
42
+
43
+ def self.token_length_from_messages(messages, model_name, options)
44
+ messages.sum { |message| token_length(message.to_json, model_name, options) }
45
+ end
42
46
  end
43
47
  end
44
48
  end
@@ -43,6 +43,10 @@ module Langchain
43
43
  response.dig("tokenCount")
44
44
  end
45
45
 
46
+ def self.token_length_from_messages(messages, model_name, options)
47
+ messages.sum { |message| token_length(message.to_json, model_name, options) }
48
+ end
49
+
46
50
  def self.token_limit(model_name)
47
51
  TOKEN_LIMITS.dig(model_name, "input_token_limit")
48
52
  end
@@ -75,6 +75,47 @@ module Langchain
75
75
  max_tokens = super(content, model_name, options)
76
76
  [options[:max_tokens], max_tokens].reject(&:nil?).min
77
77
  end
78
+
79
+ # Copied from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
80
+ # Return the number of tokens used by a list of messages
81
+ #
82
+ # @param messages [Array<Hash>] The messages to calculate the token length for
83
+ # @param model [String] The model name to validate against
84
+ # @return [Integer] The token length of the messages
85
+ #
86
+ def self.token_length_from_messages(messages, model_name, options = {})
87
+ encoding = Tiktoken.encoding_for_model(model_name)
88
+
89
+ if ["gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "gpt-4-0314", "gpt-4-32k-0314", "gpt-4-0613", "gpt-4-32k-0613"].include?(model_name)
90
+ tokens_per_message = 3
91
+ tokens_per_name = 1
92
+ elsif model_name == "gpt-3.5-turbo-0301"
93
+ tokens_per_message = 4 # every message follows {role/name}\n{content}\n
94
+ tokens_per_name = -1 # if there's a name, the role is omitted
95
+ elsif model_name.include?("gpt-3.5-turbo")
96
+ puts "Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613."
97
+ return token_length_from_messages(messages, "gpt-3.5-turbo-0613", options)
98
+ elsif model_name.include?("gpt-4")
99
+ puts "Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613."
100
+ return token_length_from_messages(messages, "gpt-4-0613", options)
101
+ else
102
+ raise NotImplementedError.new(
103
+ "token_length_from_messages() is not implemented for model #{model_name}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."
104
+ )
105
+ end
106
+
107
+ num_tokens = 0
108
+ messages.each do |message|
109
+ num_tokens += tokens_per_message
110
+ message.each do |key, value|
111
+ num_tokens += encoding.encode(value).length
112
+ num_tokens += tokens_per_name if ["name", :name].include?(key)
113
+ end
114
+ end
115
+
116
+ num_tokens += 3 # every reply is primed with assistant
117
+ num_tokens
118
+ end
78
119
  end
79
120
  end
80
121
  end
@@ -175,13 +175,13 @@ module Langchain::Vectorsearch
175
175
  prompt_template.format(question: question, context: context)
176
176
  end
177
177
 
178
- def add_data(paths:)
178
+ def add_data(paths:, options: {}, chunker: Langchain::Chunker::Text)
179
179
  raise ArgumentError, "Paths must be provided" if Array(paths).empty?
180
180
 
181
181
  texts = Array(paths)
182
182
  .flatten
183
183
  .map do |path|
184
- data = Langchain::Loader.new(path)&.load&.chunks
184
+ data = Langchain::Loader.new(path, options, chunker: chunker)&.load&.chunks
185
185
  data.map { |chunk| chunk.text }
186
186
  end
187
187
 
@@ -36,7 +36,11 @@ module Langchain::Vectorsearch
36
36
  status_code, response = @client.database.load_db(db_name, db_path)
37
37
 
38
38
  if status_code != 200
39
- if status_code == 500 && response["message"].include?("already loaded")
39
+ if status_code == 409 || (status_code == 500 && response["message"].include?("already loaded"))
40
+ # When db is already loaded, Epsilla may return HTTP 409 Conflict.
41
+ # This behavior is changed in https://github.com/epsilla-cloud/vectordb/pull/95
42
+ # Old behavior (HTTP 500) is preserved for backwards compatibility.
43
+ # It does not prevent us from using the db.
40
44
  Langchain.logger.info("Database already loaded")
41
45
  else
42
46
  raise "Failed to load database: #{response}"
@@ -64,13 +64,13 @@ module Langchain::Vectorsearch
64
64
  index.upsert(vectors: vectors, namespace: namespace)
65
65
  end
66
66
 
67
- def add_data(paths:, namespace: "")
67
+ def add_data(paths:, namespace: "", options: {}, chunker: Langchain::Chunker::Text)
68
68
  raise ArgumentError, "Paths must be provided" if Array(paths).empty?
69
69
 
70
70
  texts = Array(paths)
71
71
  .flatten
72
72
  .map do |path|
73
- data = Langchain::Loader.new(path)&.load&.chunks
73
+ data = Langchain::Loader.new(path, options, chunker: chunker)&.load&.chunks
74
74
  data.map { |chunk| chunk.text }
75
75
  end
76
76
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Langchain
4
- VERSION = "0.8.1"
4
+ VERSION = "0.8.2"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: langchainrb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.1
4
+ version: 0.8.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrei Bondarev
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-12-07 00:00:00.000000000 Z
11
+ date: 2023-12-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: baran
@@ -618,6 +618,7 @@ files:
618
618
  - lib/langchain/agent/sql_query_agent/sql_query_agent_sql_prompt.yaml
619
619
  - lib/langchain/chunk.rb
620
620
  - lib/langchain/chunker/base.rb
621
+ - lib/langchain/chunker/markdown.rb
621
622
  - lib/langchain/chunker/prompts/semantic_prompt_template.yml
622
623
  - lib/langchain/chunker/recursive_text.rb
623
624
  - lib/langchain/chunker/semantic.rb
@@ -677,6 +678,7 @@ files:
677
678
  - lib/langchain/processors/html.rb
678
679
  - lib/langchain/processors/json.rb
679
680
  - lib/langchain/processors/jsonl.rb
681
+ - lib/langchain/processors/markdown.rb
680
682
  - lib/langchain/processors/pdf.rb
681
683
  - lib/langchain/processors/text.rb
682
684
  - lib/langchain/processors/xlsx.rb