langchainrb 0.3.11 → 0.3.12

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ff7f99d961b09e827df297ddb3144821c9103fd40eabb32688ca92588a73415c
-  data.tar.gz: bb83eaa99055cf45cceaccb18a84e9fd4ee3ea4a93a6a0c66e04ede43e5d4bc0
+  metadata.gz: 974f0a2b8ce3fe42144016bd740ee9d4f7e597834319cc92fbf1d50bd1f4468e
+  data.tar.gz: 3686a42c37eb117e6d7485ef4f7777c0f12968bb9cdcc3a30c7721c86c0a4325
 SHA512:
-  metadata.gz: 40e5362520220d3ffc1b4c29c3e430b051de334c2f281d9cb7d7549a93be40b26b379dbd35d0c91ccb5010c1a495a653e31768f1b7a95bc087059d59339fd1a7
-  data.tar.gz: 04f24944b590ee8b577419a95718ad6796bc4cdc34d52cf05e287806912c05ba8ace22e07181a8537892124881b65a8a221e1c75d8c6245231dd5660c6b4308c
+  metadata.gz: a61f9b36d9d19eb6cf87af18c7fb40f55d39771257d08a6af2ec3384988419dfb158ffa8fc81c3769c0149f1ffa8b03200366bbea55b03b0d1553912af8d9ae6
+  data.tar.gz: 7dc53be923fe5b8587f61617198b24c42e8793fbd8e18c42a17035bf68279c59c37c6c691cabe13c83adc5dc2cff66ea293f198297ab9a9de30aa68ca72bd9c4
data/CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
 ## [Unreleased]
 
+## [0.3.12] - 2023-05-25
+- 🔍 Vectorsearch
+  - Introduce namespace support for Pinecone
+- 🚚 Loaders
+  - Loaders overhaul
+
 ## [0.3.11] - 2023-05-23
 - 🗣️ LLMs
   - Introducing support for Google PaLM (Pathways Language Model)
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    langchainrb (0.3.11)
+    langchainrb (0.3.12)
 
 GEM
   remote: https://rubygems.org/
@@ -271,6 +271,7 @@ GEM
     zeitwerk (2.6.8)
 
 PLATFORMS
+  arm64-darwin-21
   arm64-darwin-22
   x86_64-darwin-19
   x86_64-darwin-22
data/README.md CHANGED
@@ -268,12 +268,28 @@ agent.run(question: "How many full soccer fields would be needed to cover the di
 
 Need to read data from various sources? Load it up.
 
-| Name | Class         | Gem Requirements             |
-| ---- | ------------- | :--------------------------: |
-| docx | Loaders::Docx | `gem "docx", "~> 0.8.0"`     |
-| html | Loaders::HTML | `gem "nokogiri", "~> 1.13"`  |
-| pdf  | Loaders::PDF  | `gem "pdf-reader", "~> 1.4"` |
-| text | Loaders::Text |                              |
+##### Usage
+
+Just call `Langchain::Loader.load` with the path to the file or a URL you want to load.
+
+```ruby
+Langchain::Loader.load('/path/to/file.pdf')
+```
+
+or
+
+```ruby
+Langchain::Loader.load('https://www.example.com/file.pdf')
+```
+
+##### Supported Formats
+
+| Format | Processor        | Gem Requirements             |
+| ------ | ---------------- | :--------------------------: |
+| docx   | Processors::Docx | `gem "docx", "~> 0.8.0"`     |
+| html   | Processors::HTML | `gem "nokogiri", "~> 1.13"`  |
+| pdf    | Processors::PDF  | `gem "pdf-reader", "~> 1.4"` |
+| text   | Processors::Text |                              |
 
 ## Examples
 Additional examples available: [/examples](https://github.com/andreibondarev/langchainrb/tree/main/examples)
@@ -26,8 +26,8 @@ docs = [
 ]
 
 # Add data to the index. Chroma will use OpenAI to generate embeddings behind the scenes.
-chroma.add_texts(
-  texts: docs
+chroma.add_data(
+  paths: docs
 )
 
 # Query your data
data/lib/langchain/loader.rb ADDED
@@ -0,0 +1,80 @@
+# frozen_string_literal: true
+
+require "open-uri"
+
+module Langchain
+  class Loader
+    class FileNotFound < StandardError; end
+
+    class UnknownFormatError < StandardError; end
+
+    URI_REGEX = %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}
+
+    # Load data from a file or url
+    # Equivalent to Langchain::Loader.new(path).load
+    # @param path [String | Pathname] path to file or url
+    # @return [String] file content
+    def self.load(path)
+      new(path).load
+    end
+
+    # Initialize Langchain::Loader
+    # @param path [String | Pathname] path to file or url
+    # @return [Langchain::Loader] loader instance
+    def initialize(path)
+      @path = path
+    end
+
+    # Check if the path is a URL
+    # @return [Boolean] true if the path is a URL
+    def url?
+      return false if @path.is_a?(Pathname)
+
+      !!(@path =~ URI_REGEX)
+    end
+
+    # Load data from a file or url
+    # @return [String] file content
+    def load
+      url? ? from_url(@path) : from_path(@path)
+    end
+
+    private
+
+    def from_url(url)
+      process do
+        data = URI.parse(url).open
+        processor = find_processor(:CONTENT_TYPES, data.content_type)
+        [data, processor]
+      end
+    end
+
+    def from_path(path)
+      raise FileNotFound unless File.exist?(path)
+
+      process do
+        [File.open(path), find_processor(:EXTENSIONS, File.extname(path))]
+      end
+    end
+
+    def process(&block)
+      data, processor = yield
+
+      raise UnknownFormatError unless processor
+
+      Langchain::Processors.const_get(processor).new.parse(data)
+    end
+
+    def find_processor(constant, value)
+      processors.find { |klass| processor_matches? "#{klass}::#{constant}", value }
+    end
+
+    def processor_matches?(constant, value)
+      Langchain::Processors.const_get(constant).include?(value)
+    end
+
+    def processors
+      Langchain::Processors.constants
+    end
+  end
+end
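
For orientation, the new loader's public surface is small. A minimal usage sketch (the file path and URL are illustrative; the error classes are the ones defined above):

```ruby
require "langchain"

# Local files pick a processor by extension (".pdf" → Langchain::Processors::PDF).
text = Langchain::Loader.load("/path/to/my.pdf")

# URLs pick a processor by the response's Content-Type header.
page_text = Langchain::Loader.load("https://www.example.com/index.html")

# Unsupported formats raise Langchain::Loader::UnknownFormatError;
# missing local files raise Langchain::Loader::FileNotFound.
```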
data/lib/langchain/processors/base.rb ADDED
@@ -0,0 +1,14 @@
+# frozen_string_literal: true
+
+module Langchain
+  module Processors
+    class Base
+      EXTENSIONS = []
+      CONTENT_TYPES = []
+
+      def parse(data)
+        raise NotImplementedError
+      end
+    end
+  end
+end
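
Because `Langchain::Loader` discovers processors by enumerating `Langchain::Processors.constants`, a new format can be wired in by subclassing `Base` under that module: `EXTENSIONS` is matched for local files, `CONTENT_TYPES` for URLs. A hypothetical Markdown processor as a sketch (this class is illustrative, not part of the release):

```ruby
# frozen_string_literal: true

module Langchain
  module Processors
    # Hypothetical example: return Markdown files as plain text.
    class Markdown < Base
      EXTENSIONS = [".md", ".markdown"]
      CONTENT_TYPES = ["text/markdown"]

      # @param [File] data
      # @return [String]
      def parse(data)
        data.read
      end
    end
  end
end
```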
data/lib/langchain/processors/docx.rb ADDED
@@ -0,0 +1,24 @@
+# frozen_string_literal: true
+
+module Langchain
+  module Processors
+    class Docx < Base
+      EXTENSIONS = [".docx"]
+      CONTENT_TYPES = ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
+
+      def initialize
+        depends_on "docx"
+        require "docx"
+      end
+
+      # Parse the document and return the text
+      # @param [File] data
+      # @return [String]
+      def parse(data)
+        ::Docx::Document
+          .open(StringIO.new(data.read))
+          .text
+      end
+    end
+  end
+end
data/lib/langchain/processors/html.rb ADDED
@@ -0,0 +1,28 @@
+# frozen_string_literal: true
+
+module Langchain
+  module Processors
+    class HTML < Base
+      EXTENSIONS = [".html", ".htm"]
+      CONTENT_TYPES = ["text/html"]
+
+      # We only look for headings and paragraphs
+      TEXT_CONTENT_TAGS = %w[h1 h2 h3 h4 h5 h6 p]
+
+      def initialize
+        depends_on "nokogiri"
+        require "nokogiri"
+      end
+
+      # Parse the document and return the text
+      # @param [File] data
+      # @return [String]
+      def parse(data)
+        Nokogiri::HTML(data.read)
+          .css(TEXT_CONTENT_TAGS.join(","))
+          .map(&:inner_text)
+          .join("\n\n")
+      end
+    end
+  end
+end
data/lib/langchain/processors/pdf.rb ADDED
@@ -0,0 +1,26 @@
+# frozen_string_literal: true
+
+module Langchain
+  module Processors
+    class PDF < Base
+      EXTENSIONS = [".pdf"]
+      CONTENT_TYPES = ["application/pdf"]
+
+      def initialize
+        depends_on "pdf-reader"
+        require "pdf-reader"
+      end
+
+      # Parse the document and return the text
+      # @param [File] data
+      # @return [String]
+      def parse(data)
+        ::PDF::Reader
+          .new(StringIO.new(data.read))
+          .pages
+          .map(&:text)
+          .join("\n\n")
+      end
+    end
+  end
+end
data/lib/langchain/processors/text.rb ADDED
@@ -0,0 +1,17 @@
+# frozen_string_literal: true
+
+module Langchain
+  module Processors
+    class Text < Base
+      EXTENSIONS = [".txt"]
+      CONTENT_TYPES = ["text/plain"]
+
+      # Parse the document and return the text
+      # @param [File] data
+      # @return [String]
+      def parse(data)
+        data.read
+      end
+    end
+  end
+end
data/lib/langchain.rb CHANGED
@@ -7,7 +7,6 @@ require_relative "./version"
 require_relative "./dependency_helper"
 module Langchain
   class << self
-    attr_accessor :default_loaders
     attr_accessor :logger
 
     attr_reader :root
@@ -16,6 +15,16 @@ module Langchain
   @logger ||= ::Logger.new($stdout, level: :warn, formatter: ->(severity, datetime, progname, msg) { "[LangChain.rb] #{msg}\n" })
 
   @root = Pathname.new(__dir__)
+
+  autoload :Loader, "langchain/loader"
+
+  module Processors
+    autoload :Base, "langchain/processors/base"
+    autoload :PDF, "langchain/processors/pdf"
+    autoload :HTML, "langchain/processors/html"
+    autoload :Text, "langchain/processors/text"
+    autoload :Docx, "langchain/processors/docx"
+  end
 end
 
 module Agent
@@ -55,16 +64,3 @@ module Tool
   autoload :SerpApi, "tool/serp_api"
   autoload :Wikipedia, "tool/wikipedia"
 end
-
-module Loaders
-  autoload :Base, "loaders/base"
-  autoload :Docx, "loaders/docx"
-  autoload :PDF, "loaders/pdf"
-  autoload :Text, "loaders/text"
-  autoload :HTML, "loaders/html"
-end
-
-autoload :Loader, "loader"
-
-# Load the default Loaders
-Langchain.default_loaders ||= [::Loaders::Text, ::Loaders::PDF, ::Loaders::Docx]
data/lib/langchainrb.rb ADDED
@@ -0,0 +1 @@
+require "langchain"
data/lib/vectorsearch/base.rb CHANGED
@@ -19,8 +19,6 @@ module Vectorsearch
       @llm_api_key = llm_api_key
 
       @llm_client = LLM.const_get(LLM::Base::LLMS.fetch(llm)).new(api_key: llm_api_key)
-
-      @loaders = Langchain.default_loaders
     end
 
     # Method supported by Vectorsearch DB to create a default schema
@@ -74,18 +72,12 @@ module Vectorsearch
       raise ArgumentError, "Either path or paths must be provided" if path.nil? && paths.nil?
       raise ArgumentError, "Either path or paths must be provided, not both" if !path.nil? && !paths.nil?
 
-      texts =
-        Loader
-          .with(*loaders)
-          .load(path || paths)
+      texts = Array(path || paths)
+        .flatten
+        .map { |path| Langchain::Loader.new(path)&.load }
+        .compact
 
       add_texts(texts: texts)
     end
-
-    attr_reader :loaders
-
-    def add_loader(*loaders)
-      loaders.each { |loader| @loaders << loader }
-    end
   end
 end
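
With the `Loader.with(*loaders)` pipeline gone, `add_data` now routes every path through `Langchain::Loader`. A short sketch of the call side (`store` stands in for any configured Vectorsearch instance, e.g. a `Vectorsearch::Qdrant`):

```ruby
# Single file: parsed by the processor matching its extension, then embedded.
store.add_data(path: Langchain.root.join("path/to/my.pdf"))

# Several files in one call; each is loaded, parsed to text, and embedded.
store.add_data(paths: ["docs/report.docx", "docs/notes.txt"])

# Supplying both keyword arguments raises ArgumentError, as does supplying neither.
```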
data/lib/vectorsearch/pinecone.rb CHANGED
@@ -25,20 +25,22 @@ module Vectorsearch
 
     # Add a list of texts to the index
     # @param texts [Array] The list of texts to add
+    # @param namespace [String] The namespace to add the texts to
+    # @param metadata [Hash] The metadata to use for the texts
     # @return [Hash] The response from the server
-    def add_texts(texts:)
+    def add_texts(texts:, namespace: "", metadata: nil)
       vectors = texts.map do |text|
         {
           # TODO: Allows passing in your own IDs
           id: SecureRandom.uuid,
-          metadata: {content: text},
+          metadata: metadata || {content: text},
           values: llm_client.embed(text: text)
         }
       end
 
       index = client.index(index_name)
 
-      index.upsert(vectors: vectors)
+      index.upsert(vectors: vectors, namespace: namespace)
     end
 
     # Create the index with the default schema
@@ -54,40 +56,54 @@ module Vectorsearch
     # Search for similar texts
     # @param query [String] The text to search for
     # @param k [Integer] The number of results to return
+    # @param namespace [String] The namespace to search in
+    # @param filter [String] The filter to use
     # @return [Array] The list of results
     def similarity_search(
       query:,
-      k: 4
+      k: 4,
+      namespace: "",
+      filter: nil
     )
       embedding = llm_client.embed(text: query)
 
       similarity_search_by_vector(
         embedding: embedding,
-        k: k
+        k: k,
+        namespace: namespace,
+        filter: filter
       )
     end
 
     # Search for similar texts by embedding
     # @param embedding [Array] The embedding to search for
     # @param k [Integer] The number of results to return
+    # @param namespace [String] The namespace to search in
+    # @param filter [String] The filter to use
     # @return [Array] The list of results
-    def similarity_search_by_vector(embedding:, k: 4)
+    def similarity_search_by_vector(embedding:, k: 4, namespace: "", filter: nil)
       index = client.index(index_name)
 
-      response = index.query(
+      query_params = {
         vector: embedding,
+        namespace: namespace,
+        filter: filter,
         top_k: k,
         include_values: true,
         include_metadata: true
-      )
+      }.compact
+
+      response = index.query(query_params)
       response.dig("matches")
     end
 
     # Ask a question and return the answer
     # @param question [String] The question to ask
+    # @param namespace [String] The namespace to search in
+    # @param filter [String] The filter to use
     # @return [String] The answer to the question
-    def ask(question:)
-      search_results = similarity_search(query: question)
+    def ask(question:, namespace: "", filter: nil)
+      search_results = similarity_search(query: question, namespace: namespace, filter: filter)
 
       context = search_results.map do |result|
         result.dig("metadata").to_s
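
Putting the Pinecone changes together, a namespace can now be threaded through write, search, and Q&A calls. A sketch (`pinecone` stands in for a configured `Vectorsearch::Pinecone`; the namespace value is illustrative):

```ruby
# Upsert vectors into a namespace (defaults to "" when omitted).
pinecone.add_texts(texts: ["Lorem ipsum dolor sit amet"], namespace: "recipes")

# Search only within that namespace; a nil filter is stripped by Hash#compact
# before the query is sent to Pinecone.
pinecone.similarity_search(query: "ipsum", k: 4, namespace: "recipes")

# ask() forwards namespace and filter to similarity_search before answering.
pinecone.ask(question: "What is the first word?", namespace: "recipes")
```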
data/lib/version.rb CHANGED
@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 
 module Langchain
-  VERSION = "0.3.11"
+  VERSION = "0.3.12"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: langchainrb
 version: !ruby/object:Gem::Version
-  version: 0.3.11
+  version: 0.3.12
 platform: ruby
 authors:
 - Andrei Bondarev
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-05-24 00:00:00.000000000 Z
+date: 2023-05-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: dotenv-rails
@@ -288,18 +288,19 @@ files:
 - lib/agent/chain_of_thought_agent/chain_of_thought_agent_prompt.json
 - lib/dependency_helper.rb
 - lib/langchain.rb
+- lib/langchain/loader.rb
+- lib/langchain/processors/base.rb
+- lib/langchain/processors/docx.rb
+- lib/langchain/processors/html.rb
+- lib/langchain/processors/pdf.rb
+- lib/langchain/processors/text.rb
+- lib/langchainrb.rb
 - lib/llm/base.rb
 - lib/llm/cohere.rb
 - lib/llm/google_palm.rb
 - lib/llm/hugging_face.rb
 - lib/llm/openai.rb
 - lib/llm/replicate.rb
-- lib/loader.rb
-- lib/loaders/base.rb
-- lib/loaders/docx.rb
-- lib/loaders/html.rb
-- lib/loaders/pdf.rb
-- lib/loaders/text.rb
 - lib/prompt/base.rb
 - lib/prompt/few_shot_prompt_template.rb
 - lib/prompt/loading.rb
data/lib/loader.rb DELETED
@@ -1,26 +0,0 @@
-module Loader
-  def self.with(*loaders)
-    LoaderSet.new(loaders)
-  end
-
-  class LoaderSet
-    def initialize(loaders)
-      @loaders = Array(loaders)
-    end
-
-    def load(*paths)
-      Array(paths)
-        .flatten
-        .map { |path| first_loadable_loader(path)&.load }
-        .compact
-    end
-
-    def first_loadable_loader(path)
-      @loaders
-        .each do |loader_klass|
-          loader_instance = loader_klass.new(path)
-          return(loader_instance) if loader_instance.loadable?
-        end
-    end
-  end
-end
data/lib/loaders/base.rb DELETED
@@ -1,19 +0,0 @@
-# frozen_string_literal: true
-
-# TODO: Add chunking options to the loaders
-
-module Loaders
-  class Base
-    def self.load(path)
-      new.load(path)
-    end
-
-    def initialize(path)
-      @path = path
-    end
-
-    def loadable?
-      raise NotImplementedError
-    end
-  end
-end
data/lib/loaders/docx.rb DELETED
@@ -1,34 +0,0 @@
-# frozen_string_literal: true
-
-module Loaders
-  class Docx < Base
-    #
-    # This Loader parses Docx files into text.
-    # If you'd like to use it directly you can do so like this:
-    #   Loaders::Docx.new("path/to/my.docx").load
-    #
-    # This parser is also invoked when you're adding data to a Vectorsearch DB:
-    #   qdrant = Vectorsearch::Qdrant.new(...)
-    #   path = Langchain.root.join("path/to/my.docx")
-    #   qdrant.add_data(path: path)
-    #
-
-    def initialize(path)
-      depends_on "docx"
-      require "docx"
-
-      @path = path
-    end
-
-    # Check that the file is a `.docx` file
-    def loadable?
-      @path.to_s.end_with?(".docx")
-    end
-
-    def load
-      ::Docx::Document
-        .open(@path.to_s)
-        .text
-    end
-  end
-end
data/lib/loaders/html.rb DELETED
@@ -1,38 +0,0 @@
-# frozen_string_literal: true
-
-require "open-uri"
-
-module Loaders
-  class HTML < Base
-    # We only look for headings and paragraphs
-    TEXT_CONTENT_TAGS = %w[h1 h2 h3 h4 h5 h6 p]
-
-    #
-    # This Loader parses URL into a text.
-    # If you'd like to use it directly you can do so like this:
-    #   Loaders::URL.new("https://nokogiri.org/").load
-    #
-    def initialize(url)
-      depends_on "nokogiri"
-      require "nokogiri"
-
-      @url = url
-    end
-
-    # Check that url is a valid URL
-    def loadable?
-      !!(@url =~ URI::DEFAULT_PARSER.make_regexp)
-    end
-
-    def load
-      return unless response.status.first == "200"
-
-      doc = Nokogiri::HTML(response.read)
-      doc.css(TEXT_CONTENT_TAGS.join(",")).map(&:inner_text).join("\n\n")
-    end
-
-    def response
-      @response ||= URI.parse(@url).open
-    end
-  end
-end
data/lib/loaders/pdf.rb DELETED
@@ -1,36 +0,0 @@
-# frozen_string_literal: true
-
-module Loaders
-  class PDF < Base
-    #
-    # This Loader parses PDF files into text.
-    # If you'd like to use it directly you can do so like this:
-    #   Loaders::PDF.new("path/to/my.pdf").load
-    #
-    # This parser is also invoked when you're adding data to a Vectorsearch DB:
-    #   qdrant = Vectorsearch::Qdrant.new(...)
-    #   path = Langchain.root.join("path/to/my.pdf")
-    #   qdrant.add_data(path: path)
-    #
-
-    def initialize(path)
-      depends_on "pdf-reader"
-      require "pdf-reader"
-
-      @path = path
-    end
-
-    # Check that the file is a PDF file
-    def loadable?
-      @path.to_s.end_with?(".pdf")
-    end
-
-    def load
-      ::PDF::Reader
-        .new(@path)
-        .pages
-        .map(&:text)
-        .join("\n\n")
-    end
-  end
-end
data/lib/loaders/text.rb DELETED
@@ -1,24 +0,0 @@
-# frozen_string_literal: true
-
-module Loaders
-  class Text < Base
-    #
-    # This Loader parses .txt files.
-    # If you'd like to use it directly you can do so like this:
-    #   Loaders::Text.new("path/to/my.txt").load
-    #
-    # This parser is also invoked when you're adding data to a Vectorsearch DB:
-    #   qdrant = Vectorsearch::Qdrant.new(...)
-    #   path = Langchain.root.join("path/to/my.txt")
-    #   qdrant.add_data(path: path)
-    #
-
-    def loadable?
-      @path.to_s.end_with?(".txt")
-    end
-
-    def load
-      @path.read
-    end
-  end
-end