langchainrb 0.3.11 → 0.3.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ff7f99d961b09e827df297ddb3144821c9103fd40eabb32688ca92588a73415c
4
- data.tar.gz: bb83eaa99055cf45cceaccb18a84e9fd4ee3ea4a93a6a0c66e04ede43e5d4bc0
3
+ metadata.gz: 974f0a2b8ce3fe42144016bd740ee9d4f7e597834319cc92fbf1d50bd1f4468e
4
+ data.tar.gz: 3686a42c37eb117e6d7485ef4f7777c0f12968bb9cdcc3a30c7721c86c0a4325
5
5
  SHA512:
6
- metadata.gz: 40e5362520220d3ffc1b4c29c3e430b051de334c2f281d9cb7d7549a93be40b26b379dbd35d0c91ccb5010c1a495a653e31768f1b7a95bc087059d59339fd1a7
7
- data.tar.gz: 04f24944b590ee8b577419a95718ad6796bc4cdc34d52cf05e287806912c05ba8ace22e07181a8537892124881b65a8a221e1c75d8c6245231dd5660c6b4308c
6
+ metadata.gz: a61f9b36d9d19eb6cf87af18c7fb40f55d39771257d08a6af2ec3384988419dfb158ffa8fc81c3769c0149f1ffa8b03200366bbea55b03b0d1553912af8d9ae6
7
+ data.tar.gz: 7dc53be923fe5b8587f61617198b24c42e8793fbd8e18c42a17035bf68279c59c37c6c691cabe13c83adc5dc2cff66ea293f198297ab9a9de30aa68ca72bd9c4
data/CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ## [0.3.12] - 2023-05-25
4
+ - 🔍 Vectorsearch
5
+ - Introduce namespace support for Pinecone
6
+ - 🚚 Loaders
7
+ - Loaders overhaul
8
+
3
9
  ## [0.3.11] - 2023-05-23
4
10
  - 🗣️ LLMs
5
11
  - Introducing support for Google PaLM (Pathways Language Model)
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- langchainrb (0.3.11)
4
+ langchainrb (0.3.12)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -271,6 +271,7 @@ GEM
271
271
  zeitwerk (2.6.8)
272
272
 
273
273
  PLATFORMS
274
+ arm64-darwin-21
274
275
  arm64-darwin-22
275
276
  x86_64-darwin-19
276
277
  x86_64-darwin-22
data/README.md CHANGED
@@ -268,12 +268,28 @@ agent.run(question: "How many full soccer fields would be needed to cover the di
268
268
 
269
269
  Need to read data from various sources? Load it up.
270
270
 
271
- | Name | Class | Gem Requirements |
272
- | ---- | ------------- | :--------------------------: |
273
- | docx | Loaders::Docx | `gem "docx", "~> 0.8.0"` |
274
- | html | Loaders::HTML | `gem "nokogiri", "~> 1.13"` |
275
- | pdf | Loaders::PDF | `gem "pdf-reader", "~> 1.4"` |
276
- | text | Loaders::Text | |
271
+ ##### Usage
272
+
273
+ Just call `Langchain::Loader.load` with the path to the file or a URL you want to load.
274
+
275
+ ```ruby
276
+ Langchain::Loader.load('/path/to/file.pdf')
277
+ ```
278
+
279
+ or
280
+
281
+ ```ruby
282
+ Langchain::Loader.load('https://www.example.com/file.pdf')
283
+ ```
284
+
285
+ ##### Supported Formats
286
+
287
+ | Format | Processor | Gem Requirements |
288
+ | ------ | ---------------- | :--------------------------: |
289
+ | docx | Processors::Docx | `gem "docx", "~> 0.8.0"` |
290
+ | html | Processors::HTML | `gem "nokogiri", "~> 1.13"` |
291
+ | pdf | Processors::PDF | `gem "pdf-reader", "~> 1.4"` |
292
+ | text | Processors::Text | |
277
293
 
278
294
  ## Examples
279
295
  Additional examples available: [/examples](https://github.com/andreibondarev/langchainrb/tree/main/examples)
@@ -26,8 +26,8 @@ docs = [
26
26
  ]
27
27
 
28
28
  # Add data to the index. Weaviate will use OpenAI to generate embeddings behind the scene.
29
- chroma.add_texts(
30
- texts: docs
29
+ chroma.add_data(
30
+ paths: docs
31
31
  )
32
32
 
33
33
  # Query your data
@@ -0,0 +1,80 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "open-uri"
4
+
5
+ module Langchain
6
+ class Loader
7
+ class FileNotFound < StandardError; end
8
+
9
+ class UnknownFormatError < StandardError; end
10
+
11
+ URI_REGEX = %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}
12
+
13
+ # Load data from a file or url
14
+ # Equivalent to Langchain::Loader.new(path).load
15
+ # @param path [String | Pathname] path to file or url
16
+ # @return [String] file content
17
+ def self.load(path)
18
+ new(path).load
19
+ end
20
+
21
+ # Initialize Langchain::Loader
22
+ # @param path [String | Pathname] path to file or url
23
+ # @return [Langchain::Loader] loader instance
24
+ def initialize(path)
25
+ @path = path
26
+ end
27
+
28
+ # Check if path is url
29
+ # @return [Boolean] true if path is url
30
+ def url?
31
+ return false if @path.is_a?(Pathname)
32
+
33
+ !!(@path =~ URI_REGEX)
34
+ end
35
+
36
+ # Load data from a file or url
37
+ # @return [String] file content
38
+ def load
39
+ url? ? from_url(@path) : from_path(@path)
40
+ end
41
+
42
+ private
43
+
44
+ def from_url(url)
45
+ process do
46
+ data = URI.parse(url).open
47
+ processor = find_processor(:CONTENT_TYPES, data.content_type)
48
+ [data, processor]
49
+ end
50
+ end
51
+
52
+ def from_path(path)
53
+ raise FileNotFound unless File.exist?(path)
54
+
55
+ process do
56
+ [File.open(path), find_processor(:EXTENSIONS, File.extname(path))]
57
+ end
58
+ end
59
+
60
+ def process(&block)
61
+ data, processor = yield
62
+
63
+ raise UnknownFormatError unless processor
64
+
65
+ Langchain::Processors.const_get(processor).new.parse(data)
66
+ end
67
+
68
+ def find_processor(constant, value)
69
+ processors.find { |klass| processor_matches? "#{klass}::#{constant}", value }
70
+ end
71
+
72
+ def processor_matches?(constant, value)
73
+ Langchain::Processors.const_get(constant).include?(value)
74
+ end
75
+
76
+ def processors
77
+ Langchain::Processors.constants
78
+ end
79
+ end
80
+ end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Langchain
4
+ module Processors
5
+ class Base
6
+ EXTENSIONS = []
7
+ CONTENT_TYPES = []
8
+
9
+ def parse(data)
10
+ raise NotImplementedError
11
+ end
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Langchain
4
+ module Processors
5
+ class Docx < Base
6
+ EXTENSIONS = [".docx"]
7
+ CONTENT_TYPES = ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
8
+
9
+ def initialize
10
+ depends_on "docx"
11
+ require "docx"
12
+ end
13
+
14
+ # Parse the document and return the text
15
+ # @param [File] data
16
+ # @return [String]
17
+ def parse(data)
18
+ ::Docx::Document
19
+ .open(StringIO.new(data.read))
20
+ .text
21
+ end
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Langchain
4
+ module Processors
5
+ class HTML < Base
6
+ EXTENSIONS = [".html", ".htm"]
7
+ CONTENT_TYPES = ["text/html"]
8
+
9
+ # We only look for headings and paragraphs
10
+ TEXT_CONTENT_TAGS = %w[h1 h2 h3 h4 h5 h6 p]
11
+
12
+ def initialize
13
+ depends_on "nokogiri"
14
+ require "nokogiri"
15
+ end
16
+
17
+ # Parse the document and return the text
18
+ # @param [File] data
19
+ # @return [String]
20
+ def parse(data)
21
+ Nokogiri::HTML(data.read)
22
+ .css(TEXT_CONTENT_TAGS.join(","))
23
+ .map(&:inner_text)
24
+ .join("\n\n")
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Langchain
4
+ module Processors
5
+ class PDF < Base
6
+ EXTENSIONS = [".pdf"]
7
+ CONTENT_TYPES = ["application/pdf"]
8
+
9
+ def initialize
10
+ depends_on "pdf-reader"
11
+ require "pdf-reader"
12
+ end
13
+
14
+ # Parse the document and return the text
15
+ # @param [File] data
16
+ # @return [String]
17
+ def parse(data)
18
+ ::PDF::Reader
19
+ .new(StringIO.new(data.read))
20
+ .pages
21
+ .map(&:text)
22
+ .join("\n\n")
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Langchain
4
+ module Processors
5
+ class Text < Base
6
+ EXTENSIONS = [".txt"]
7
+ CONTENT_TYPES = ["text/plain"]
8
+
9
+ # Parse the document and return the text
10
+ # @param [File] data
11
+ # @return [String]
12
+ def parse(data)
13
+ data.read
14
+ end
15
+ end
16
+ end
17
+ end
data/lib/langchain.rb CHANGED
@@ -7,7 +7,6 @@ require_relative "./version"
7
7
  require_relative "./dependency_helper"
8
8
  module Langchain
9
9
  class << self
10
- attr_accessor :default_loaders
11
10
  attr_accessor :logger
12
11
 
13
12
  attr_reader :root
@@ -16,6 +15,16 @@ module Langchain
16
15
  @logger ||= ::Logger.new($stdout, level: :warn, formatter: ->(severity, datetime, progname, msg) { "[LangChain.rb] #{msg}\n" })
17
16
 
18
17
  @root = Pathname.new(__dir__)
18
+
19
+ autoload :Loader, "langchain/loader"
20
+
21
+ module Processors
22
+ autoload :Base, "langchain/processors/base"
23
+ autoload :PDF, "langchain/processors/pdf"
24
+ autoload :HTML, "langchain/processors/html"
25
+ autoload :Text, "langchain/processors/text"
26
+ autoload :Docx, "langchain/processors/docx"
27
+ end
19
28
  end
20
29
 
21
30
  module Agent
@@ -55,16 +64,3 @@ module Tool
55
64
  autoload :SerpApi, "tool/serp_api"
56
65
  autoload :Wikipedia, "tool/wikipedia"
57
66
  end
58
-
59
- module Loaders
60
- autoload :Base, "loaders/base"
61
- autoload :Docx, "loaders/docx"
62
- autoload :PDF, "loaders/pdf"
63
- autoload :Text, "loaders/text"
64
- autoload :HTML, "loaders/html"
65
- end
66
-
67
- autoload :Loader, "loader"
68
-
69
- # Load the default Loaders
70
- Langchain.default_loaders ||= [::Loaders::Text, ::Loaders::PDF, ::Loaders::Docx]
@@ -0,0 +1 @@
1
+ require "langchain"
@@ -19,8 +19,6 @@ module Vectorsearch
19
19
  @llm_api_key = llm_api_key
20
20
 
21
21
  @llm_client = LLM.const_get(LLM::Base::LLMS.fetch(llm)).new(api_key: llm_api_key)
22
-
23
- @loaders = Langchain.default_loaders
24
22
  end
25
23
 
26
24
  # Method supported by Vectorsearch DB to create a default schema
@@ -74,18 +72,12 @@ module Vectorsearch
74
72
  raise ArgumentError, "Either path or paths must be provided" if path.nil? && paths.nil?
75
73
  raise ArgumentError, "Either path or paths must be provided, not both" if !path.nil? && !paths.nil?
76
74
 
77
- texts =
78
- Loader
79
- .with(*loaders)
80
- .load(path || paths)
75
+ texts = Array(path || paths)
76
+ .flatten
77
+ .map { |path| Langchain::Loader.new(path)&.load }
78
+ .compact
81
79
 
82
80
  add_texts(texts: texts)
83
81
  end
84
-
85
- attr_reader :loaders
86
-
87
- def add_loader(*loaders)
88
- loaders.each { |loader| @loaders << loader }
89
- end
90
82
  end
91
83
  end
@@ -25,20 +25,22 @@ module Vectorsearch
25
25
 
26
26
  # Add a list of texts to the index
27
27
  # @param texts [Array] The list of texts to add
28
+ # @param namespace [String] The namespace to add the texts to
29
+ # @param metadata [Hash] The metadata to use for the texts
28
30
  # @return [Hash] The response from the server
29
- def add_texts(texts:)
31
+ def add_texts(texts:, namespace: "", metadata: nil)
30
32
  vectors = texts.map do |text|
31
33
  {
32
34
  # TODO: Allows passing in your own IDs
33
35
  id: SecureRandom.uuid,
34
- metadata: {content: text},
36
+ metadata: metadata || {content: text},
35
37
  values: llm_client.embed(text: text)
36
38
  }
37
39
  end
38
40
 
39
41
  index = client.index(index_name)
40
42
 
41
- index.upsert(vectors: vectors)
43
+ index.upsert(vectors: vectors, namespace: namespace)
42
44
  end
43
45
 
44
46
  # Create the index with the default schema
@@ -54,40 +56,54 @@ module Vectorsearch
54
56
  # Search for similar texts
55
57
  # @param query [String] The text to search for
56
58
  # @param k [Integer] The number of results to return
59
+ # @param namespace [String] The namespace to search in
60
+ # @param filter [String] The filter to use
57
61
  # @return [Array] The list of results
58
62
  def similarity_search(
59
63
  query:,
60
- k: 4
64
+ k: 4,
65
+ namespace: "",
66
+ filter: nil
61
67
  )
62
68
  embedding = llm_client.embed(text: query)
63
69
 
64
70
  similarity_search_by_vector(
65
71
  embedding: embedding,
66
- k: k
72
+ k: k,
73
+ namespace: namespace,
74
+ filter: filter
67
75
  )
68
76
  end
69
77
 
70
78
  # Search for similar texts by embedding
71
79
  # @param embedding [Array] The embedding to search for
72
80
  # @param k [Integer] The number of results to return
81
+ # @param namespace [String] The namespace to search in
82
+ # @param filter [String] The filter to use
73
83
  # @return [Array] The list of results
74
- def similarity_search_by_vector(embedding:, k: 4)
84
+ def similarity_search_by_vector(embedding:, k: 4, namespace: "", filter: nil)
75
85
  index = client.index(index_name)
76
86
 
77
- response = index.query(
87
+ query_params = {
78
88
  vector: embedding,
89
+ namespace: namespace,
90
+ filter: filter,
79
91
  top_k: k,
80
92
  include_values: true,
81
93
  include_metadata: true
82
- )
94
+ }.compact
95
+
96
+ response = index.query(query_params)
83
97
  response.dig("matches")
84
98
  end
85
99
 
86
100
  # Ask a question and return the answer
87
101
  # @param question [String] The question to ask
102
+ # @param namespace [String] The namespace to search in
103
+ # @param filter [String] The filter to use
88
104
  # @return [String] The answer to the question
89
- def ask(question:)
90
- search_results = similarity_search(query: question)
105
+ def ask(question:, namespace: "", filter: nil)
106
+ search_results = similarity_search(query: question, namespace: namespace, filter: filter)
91
107
 
92
108
  context = search_results.map do |result|
93
109
  result.dig("metadata").to_s
data/lib/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Langchain
4
- VERSION = "0.3.11"
4
+ VERSION = "0.3.12"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: langchainrb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.11
4
+ version: 0.3.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrei Bondarev
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-05-24 00:00:00.000000000 Z
11
+ date: 2023-05-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: dotenv-rails
@@ -288,18 +288,19 @@ files:
288
288
  - lib/agent/chain_of_thought_agent/chain_of_thought_agent_prompt.json
289
289
  - lib/dependency_helper.rb
290
290
  - lib/langchain.rb
291
+ - lib/langchain/loader.rb
292
+ - lib/langchain/processors/base.rb
293
+ - lib/langchain/processors/docx.rb
294
+ - lib/langchain/processors/html.rb
295
+ - lib/langchain/processors/pdf.rb
296
+ - lib/langchain/processors/text.rb
297
+ - lib/langchainrb.rb
291
298
  - lib/llm/base.rb
292
299
  - lib/llm/cohere.rb
293
300
  - lib/llm/google_palm.rb
294
301
  - lib/llm/hugging_face.rb
295
302
  - lib/llm/openai.rb
296
303
  - lib/llm/replicate.rb
297
- - lib/loader.rb
298
- - lib/loaders/base.rb
299
- - lib/loaders/docx.rb
300
- - lib/loaders/html.rb
301
- - lib/loaders/pdf.rb
302
- - lib/loaders/text.rb
303
304
  - lib/prompt/base.rb
304
305
  - lib/prompt/few_shot_prompt_template.rb
305
306
  - lib/prompt/loading.rb
data/lib/loader.rb DELETED
@@ -1,26 +0,0 @@
1
- module Loader
2
- def self.with(*loaders)
3
- LoaderSet.new(loaders)
4
- end
5
-
6
- class LoaderSet
7
- def initialize(loaders)
8
- @loaders = Array(loaders)
9
- end
10
-
11
- def load(*paths)
12
- Array(paths)
13
- .flatten
14
- .map { |path| first_loadable_loader(path)&.load }
15
- .compact
16
- end
17
-
18
- def first_loadable_loader(path)
19
- @loaders
20
- .each do |loader_klass|
21
- loader_instance = loader_klass.new(path)
22
- return(loader_instance) if loader_instance.loadable?
23
- end
24
- end
25
- end
26
- end
data/lib/loaders/base.rb DELETED
@@ -1,19 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- # TODO: Add chunking options to the loaders
4
-
5
- module Loaders
6
- class Base
7
- def self.load(path)
8
- new.load(path)
9
- end
10
-
11
- def initialize(path)
12
- @path = path
13
- end
14
-
15
- def loadable?
16
- raise NotImplementedError
17
- end
18
- end
19
- end
data/lib/loaders/docx.rb DELETED
@@ -1,34 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Loaders
4
- class Docx < Base
5
- #
6
- # This Loader parses Docx files into text.
7
- # If you'd like to use it directly you can do so like this:
8
- # Loaders::Docx.new("path/to/my.docx").load
9
- #
10
- # This parser is also invoked when you're adding data to a Vectorsearch DB:
11
- # qdrant = Vectorsearch::Qdrant.new(...)
12
- # path = Langchain.root.join("path/to/my.docx")
13
- # qdrant.add_data(path: path)
14
- #
15
-
16
- def initialize(path)
17
- depends_on "docx"
18
- require "docx"
19
-
20
- @path = path
21
- end
22
-
23
- # Check that the file is a `.docx` file
24
- def loadable?
25
- @path.to_s.end_with?(".docx")
26
- end
27
-
28
- def load
29
- ::Docx::Document
30
- .open(@path.to_s)
31
- .text
32
- end
33
- end
34
- end
data/lib/loaders/html.rb DELETED
@@ -1,38 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "open-uri"
4
-
5
- module Loaders
6
- class HTML < Base
7
- # We only look for headings and paragraphs
8
- TEXT_CONTENT_TAGS = %w[h1 h2 h3 h4 h5 h6 p]
9
-
10
- #
11
- # This Loader parses URL into a text.
12
- # If you'd like to use it directly you can do so like this:
13
- # Loaders::URL.new("https://nokogiri.org/").load
14
- #
15
- def initialize(url)
16
- depends_on "nokogiri"
17
- require "nokogiri"
18
-
19
- @url = url
20
- end
21
-
22
- # Check that url is a valid URL
23
- def loadable?
24
- !!(@url =~ URI::DEFAULT_PARSER.make_regexp)
25
- end
26
-
27
- def load
28
- return unless response.status.first == "200"
29
-
30
- doc = Nokogiri::HTML(response.read)
31
- doc.css(TEXT_CONTENT_TAGS.join(",")).map(&:inner_text).join("\n\n")
32
- end
33
-
34
- def response
35
- @response ||= URI.parse(@url).open
36
- end
37
- end
38
- end
data/lib/loaders/pdf.rb DELETED
@@ -1,36 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Loaders
4
- class PDF < Base
5
- #
6
- # This Loader parses PDF files into text.
7
- # If you'd like to use it directly you can do so like this:
8
- # Loaders::PDF.new("path/to/my.pdf").load
9
- #
10
- # This parser is also invoked when you're adding data to a Vectorsearch DB:
11
- # qdrant = Vectorsearch::Qdrant.new(...)
12
- # path = Langchain.root.join("path/to/my.pdf")
13
- # qdrant.add_data(path: path)
14
- #
15
-
16
- def initialize(path)
17
- depends_on "pdf-reader"
18
- require "pdf-reader"
19
-
20
- @path = path
21
- end
22
-
23
- # Check that the file is a PDF file
24
- def loadable?
25
- @path.to_s.end_with?(".pdf")
26
- end
27
-
28
- def load
29
- ::PDF::Reader
30
- .new(@path)
31
- .pages
32
- .map(&:text)
33
- .join("\n\n")
34
- end
35
- end
36
- end
data/lib/loaders/text.rb DELETED
@@ -1,24 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Loaders
4
- class Text < Base
5
- #
6
- # This Loader parses .txt files.
7
- # If you'd like to use it directly you can do so like this:
8
- # Loaders::Text.new("path/to/my.txt").load
9
- #
10
- # This parser is also invoked when you're adding data to a Vectorsearch DB:
11
- # qdrant = Vectorsearch::Qdrant.new(...)
12
- # path = Langchain.root.join("path/to/my.txt")
13
- # qdrant.add_data(path: path)
14
- #
15
-
16
- def loadable?
17
- @path.to_s.end_with?(".txt")
18
- end
19
-
20
- def load
21
- @path.read
22
- end
23
- end
24
- end