langchainrb 0.3.11 → 0.3.12
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/Gemfile.lock +2 -1
- data/README.md +22 -6
- data/examples/pdf_store_and_query_with_chroma.rb +2 -2
- data/lib/langchain/loader.rb +80 -0
- data/lib/langchain/processors/base.rb +14 -0
- data/lib/langchain/processors/docx.rb +24 -0
- data/lib/langchain/processors/html.rb +28 -0
- data/lib/langchain/processors/pdf.rb +26 -0
- data/lib/langchain/processors/text.rb +17 -0
- data/lib/langchain.rb +10 -14
- data/lib/langchainrb.rb +1 -0
- data/lib/vectorsearch/base.rb +4 -12
- data/lib/vectorsearch/pinecone.rb +26 -10
- data/lib/version.rb +1 -1
- metadata +9 -8
- data/lib/loader.rb +0 -26
- data/lib/loaders/base.rb +0 -19
- data/lib/loaders/docx.rb +0 -34
- data/lib/loaders/html.rb +0 -38
- data/lib/loaders/pdf.rb +0 -36
- data/lib/loaders/text.rb +0 -24
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 974f0a2b8ce3fe42144016bd740ee9d4f7e597834319cc92fbf1d50bd1f4468e
+  data.tar.gz: 3686a42c37eb117e6d7485ef4f7777c0f12968bb9cdcc3a30c7721c86c0a4325
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a61f9b36d9d19eb6cf87af18c7fb40f55d39771257d08a6af2ec3384988419dfb158ffa8fc81c3769c0149f1ffa8b03200366bbea55b03b0d1553912af8d9ae6
+  data.tar.gz: 7dc53be923fe5b8587f61617198b24c42e8793fbd8e18c42a17035bf68279c59c37c6c691cabe13c83adc5dc2cff66ea293f198297ab9a9de30aa68ca72bd9c4
data/CHANGELOG.md
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    langchainrb (0.3.11)
+    langchainrb (0.3.12)

 GEM
   remote: https://rubygems.org/
@@ -271,6 +271,7 @@ GEM
     zeitwerk (2.6.8)

 PLATFORMS
+  arm64-darwin-21
   arm64-darwin-22
   x86_64-darwin-19
   x86_64-darwin-22
data/README.md
CHANGED
@@ -268,12 +268,28 @@ agent.run(question: "How many full soccer fields would be needed to cover the di

 Need to read data from various sources? Load it up.

-
-
-
-
-
-
+##### Usage
+
+Just call `Langchain::Loader.load` with the path to the file or a URL you want to load.
+
+```ruby
+Langchain::Loader.load('/path/to/file.pdf')
+```
+
+or
+
+```ruby
+Langchain::Loader.load('https://www.example.com/file.pdf')
+```
+
+##### Supported Formats
+
+| Format | Processor        | Gem Requirements             |
+| ------ | ---------------- | :--------------------------: |
+| docx   | Processors::Docx | `gem "docx", "~> 0.8.0"`     |
+| html   | Processors::HTML | `gem "nokogiri", "~> 1.13"`  |
+| pdf    | Processors::PDF  | `gem "pdf-reader", "~> 1.4"` |
+| text   | Processors::Text |                              |

 ## Examples
 Additional examples available: [/examples](https://github.com/andreibondarev/langchainrb/tree/main/examples)
data/lib/langchain/loader.rb
ADDED
@@ -0,0 +1,80 @@
+# frozen_string_literal: true
+
+require "open-uri"
+
+module Langchain
+  class Loader
+    class FileNotFound < StandardError; end
+
+    class UnknownFormatError < StandardError; end
+
+    URI_REGEX = %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}
+
+    # Load data from a file or url
+    # Equivalent to Langchain::Loader.new(path).load
+    # @param path [String | Pathname] path to file or url
+    # @return [String] file content
+    def self.load(path)
+      new(path).load
+    end
+
+    # Initialize Langchain::Loader
+    # @param path [String | Pathname] path to file or url
+    # @return [Langchain::Loader] loader instance
+    def initialize(path)
+      @path = path
+    end
+
+    # Check if path is url
+    # @return [Boolean] true if path is url
+    def url?
+      return false if @path.is_a?(Pathname)
+
+      !!(@path =~ URI_REGEX)
+    end
+
+    # Load data from a file or url
+    # @return [String] file content
+    def load
+      url? ? from_url(@path) : from_path(@path)
+    end
+
+    private
+
+    def from_url(url)
+      process do
+        data = URI.parse(url).open
+        processor = find_processor(:CONTENT_TYPES, data.content_type)
+        [data, processor]
+      end
+    end
+
+    def from_path(path)
+      raise FileNotFound unless File.exist?(path)
+
+      process do
+        [File.open(path), find_processor(:EXTENSIONS, File.extname(path))]
+      end
+    end
+
+    def process(&block)
+      data, processor = yield
+
+      raise UnknownFormatError unless processor
+
+      Langchain::Processors.const_get(processor).new.parse(data)
+    end
+
+    def find_processor(constant, value)
+      processors.find { |klass| processor_matches? "#{klass}::#{constant}", value }
+    end
+
+    def processor_matches?(constant, value)
+      Langchain::Processors.const_get(constant).include?(value)
+    end
+
+    def processors
+      Langchain::Processors.constants
+    end
+  end
+end
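For orientation, a minimal usage sketch of the new loader based on the class above and the README snippet; the file path and URL are placeholders. Local paths are dispatched to a processor by file extension, URLs by the response's content type.

```ruby
require "langchain"

# Hypothetical inputs; dispatch happens via each processor's EXTENSIONS / CONTENT_TYPES.
text_from_file = Langchain::Loader.load("/path/to/file.pdf")
text_from_url  = Langchain::Loader.load("https://www.example.com/file.pdf")

# A missing local file raises Langchain::Loader::FileNotFound;
# an unrecognized format raises Langchain::Loader::UnknownFormatError.
```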
data/lib/langchain/processors/docx.rb
ADDED
@@ -0,0 +1,24 @@
+# frozen_string_literal: true
+
+module Langchain
+  module Processors
+    class Docx < Base
+      EXTENSIONS = [".docx"]
+      CONTENT_TYPES = ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
+
+      def initialize
+        depends_on "docx"
+        require "docx"
+      end
+
+      # Parse the document and return the text
+      # @param [File] data
+      # @return [String]
+      def parse(data)
+        ::Docx::Document
+          .open(StringIO.new(data.read))
+          .text
+      end
+    end
+  end
+end
data/lib/langchain/processors/html.rb
ADDED
@@ -0,0 +1,28 @@
+# frozen_string_literal: true
+
+module Langchain
+  module Processors
+    class HTML < Base
+      EXTENSIONS = [".html", ".htm"]
+      CONTENT_TYPES = ["text/html"]
+
+      # We only look for headings and paragraphs
+      TEXT_CONTENT_TAGS = %w[h1 h2 h3 h4 h5 h6 p]
+
+      def initialize
+        depends_on "nokogiri"
+        require "nokogiri"
+      end
+
+      # Parse the document and return the text
+      # @param [File] data
+      # @return [String]
+      def parse(data)
+        Nokogiri::HTML(data.read)
+          .css(TEXT_CONTENT_TAGS.join(","))
+          .map(&:inner_text)
+          .join("\n\n")
+      end
+    end
+  end
+end
data/lib/langchain/processors/pdf.rb
ADDED
@@ -0,0 +1,26 @@
+# frozen_string_literal: true
+
+module Langchain
+  module Processors
+    class PDF < Base
+      EXTENSIONS = [".pdf"]
+      CONTENT_TYPES = ["application/pdf"]
+
+      def initialize
+        depends_on "pdf-reader"
+        require "pdf-reader"
+      end
+
+      # Parse the document and return the text
+      # @param [File] data
+      # @return [String]
+      def parse(data)
+        ::PDF::Reader
+          .new(StringIO.new(data.read))
+          .pages
+          .map(&:text)
+          .join("\n\n")
+      end
+    end
+  end
+end
data/lib/langchain/processors/text.rb
ADDED
@@ -0,0 +1,17 @@
+# frozen_string_literal: true
+
+module Langchain
+  module Processors
+    class Text < Base
+      EXTENSIONS = [".txt"]
+      CONTENT_TYPES = ["text/plain"]
+
+      # Parse the document and return the text
+      # @param [File] data
+      # @return [String]
+      def parse(data)
+        data.read
+      end
+    end
+  end
+end
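A hypothetical custom processor, sketched under the assumption that `Langchain::Processors::Base` (added in this release but not shown in this diff) supplies `depends_on` and expects the same `EXTENSIONS`/`CONTENT_TYPES` constants and `parse(data)` method used by the processors above. Because `Loader#find_processor` scans `Langchain::Processors.constants`, defining a class in that namespace is enough to make it discoverable:

```ruby
module Langchain
  module Processors
    # Hypothetical Markdown processor; mirrors the shape of Processors::Text.
    class Markdown < Base
      EXTENSIONS = [".md"]
      CONTENT_TYPES = ["text/markdown"]

      # @param [File] data
      # @return [String]
      def parse(data)
        data.read
      end
    end
  end
end
```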
data/lib/langchain.rb
CHANGED
@@ -7,7 +7,6 @@ require_relative "./version"
 require_relative "./dependency_helper"
 module Langchain
   class << self
-    attr_accessor :default_loaders
     attr_accessor :logger
 
     attr_reader :root
@@ -16,6 +15,16 @@ module Langchain
   @logger ||= ::Logger.new($stdout, level: :warn, formatter: ->(severity, datetime, progname, msg) { "[LangChain.rb] #{msg}\n" })
 
   @root = Pathname.new(__dir__)
+
+  autoload :Loader, "langchain/loader"
+
+  module Processors
+    autoload :Base, "langchain/processors/base"
+    autoload :PDF, "langchain/processors/pdf"
+    autoload :HTML, "langchain/processors/html"
+    autoload :Text, "langchain/processors/text"
+    autoload :Docx, "langchain/processors/docx"
+  end
 end
 
 module Agent
@@ -55,16 +64,3 @@ module Tool
   autoload :SerpApi, "tool/serp_api"
   autoload :Wikipedia, "tool/wikipedia"
 end
-
-module Loaders
-  autoload :Base, "loaders/base"
-  autoload :Docx, "loaders/docx"
-  autoload :PDF, "loaders/pdf"
-  autoload :Text, "loaders/text"
-  autoload :HTML, "loaders/html"
-end
-
-autoload :Loader, "loader"
-
-# Load the default Loaders
-Langchain.default_loaders ||= [::Loaders::Text, ::Loaders::PDF, ::Loaders::Docx]
data/lib/langchainrb.rb
ADDED
@@ -0,0 +1 @@
+require "langchain"
data/lib/vectorsearch/base.rb
CHANGED
@@ -19,8 +19,6 @@ module Vectorsearch
       @llm_api_key = llm_api_key
 
       @llm_client = LLM.const_get(LLM::Base::LLMS.fetch(llm)).new(api_key: llm_api_key)
-
-      @loaders = Langchain.default_loaders
     end
 
     # Method supported by Vectorsearch DB to create a default schema
@@ -74,18 +72,12 @@ module Vectorsearch
       raise ArgumentError, "Either path or paths must be provided" if path.nil? && paths.nil?
       raise ArgumentError, "Either path or paths must be provided, not both" if !path.nil? && !paths.nil?
 
-      texts =
-
-
-
+      texts = Array(path || paths)
+        .flatten
+        .map { |path| Langchain::Loader.new(path)&.load }
+        .compact
 
       add_texts(texts: texts)
     end
-
-    attr_reader :loaders
-
-    def add_loader(*loaders)
-      loaders.each { |loader| @loaders << loader }
-    end
   end
 end
data/lib/vectorsearch/pinecone.rb
CHANGED
@@ -25,20 +25,22 @@ module Vectorsearch
 
     # Add a list of texts to the index
     # @param texts [Array] The list of texts to add
+    # @param namespace [String] The namespace to add the texts to
+    # @param metadata [Hash] The metadata to use for the texts
     # @return [Hash] The response from the server
-    def add_texts(texts:)
+    def add_texts(texts:, namespace: "", metadata: nil)
       vectors = texts.map do |text|
         {
           # TODO: Allows passing in your own IDs
           id: SecureRandom.uuid,
-          metadata: {content: text},
+          metadata: metadata || {content: text},
           values: llm_client.embed(text: text)
         }
       end
 
       index = client.index(index_name)
 
-      index.upsert(vectors: vectors)
+      index.upsert(vectors: vectors, namespace: namespace)
     end
 
     # Create the index with the default schema
@@ -54,40 +56,54 @@ module Vectorsearch
     # Search for similar texts
     # @param query [String] The text to search for
     # @param k [Integer] The number of results to return
+    # @param namespace [String] The namespace to search in
+    # @param filter [String] The filter to use
     # @return [Array] The list of results
     def similarity_search(
       query:,
-      k: 4
+      k: 4,
+      namespace: "",
+      filter: nil
     )
       embedding = llm_client.embed(text: query)
 
       similarity_search_by_vector(
         embedding: embedding,
-        k: k
+        k: k,
+        namespace: namespace,
+        filter: filter
       )
     end
 
     # Search for similar texts by embedding
     # @param embedding [Array] The embedding to search for
     # @param k [Integer] The number of results to return
+    # @param namespace [String] The namespace to search in
+    # @param filter [String] The filter to use
     # @return [Array] The list of results
-    def similarity_search_by_vector(embedding:, k: 4)
+    def similarity_search_by_vector(embedding:, k: 4, namespace: "", filter: nil)
       index = client.index(index_name)
 
-
+      query_params = {
         vector: embedding,
+        namespace: namespace,
+        filter: filter,
         top_k: k,
         include_values: true,
        include_metadata: true
-
+      }.compact
+
+      response = index.query(query_params)
       response.dig("matches")
     end
 
     # Ask a question and return the answer
     # @param question [String] The question to ask
+    # @param namespace [String] The namespace to search in
+    # @param filter [String] The filter to use
     # @return [String] The answer to the question
-    def ask(question:)
-      search_results = similarity_search(query: question)
+    def ask(question:, namespace: "", filter: nil)
+      search_results = similarity_search(query: question, namespace: namespace, filter: filter)
 
       context = search_results.map do |result|
         result.dig("metadata").to_s
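A sketch of the new keyword arguments introduced above for the Pinecone store; values and constructor arguments are illustrative only, and `filter` is simply forwarded to the Pinecone query:

```ruby
# Illustrative setup; see the gem's README for the supported constructor options.
store = Vectorsearch::Pinecone.new(
  environment: ENV["PINECONE_ENVIRONMENT"],
  api_key: ENV["PINECONE_API_KEY"],
  index_name: "documents",
  llm: :openai,
  llm_api_key: ENV["OPENAI_API_KEY"]
)

# namespace and metadata are new on add_texts (namespace defaults to "",
# metadata replaces the default {content: text} payload).
store.add_texts(
  texts: ["Lorem ipsum", "dolor sit amet"],
  namespace: "tenant-a",
  metadata: {source: "example"}
)

# namespace and filter are new on similarity_search, similarity_search_by_vector and ask.
store.similarity_search(query: "lorem", k: 4, namespace: "tenant-a")
store.ask(question: "What is the text about?", namespace: "tenant-a")
```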
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: langchainrb
 version: !ruby/object:Gem::Version
-  version: 0.3.11
+  version: 0.3.12
 platform: ruby
 authors:
 - Andrei Bondarev
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-05-
+date: 2023-05-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: dotenv-rails
@@ -288,18 +288,19 @@ files:
 - lib/agent/chain_of_thought_agent/chain_of_thought_agent_prompt.json
 - lib/dependency_helper.rb
 - lib/langchain.rb
+- lib/langchain/loader.rb
+- lib/langchain/processors/base.rb
+- lib/langchain/processors/docx.rb
+- lib/langchain/processors/html.rb
+- lib/langchain/processors/pdf.rb
+- lib/langchain/processors/text.rb
+- lib/langchainrb.rb
 - lib/llm/base.rb
 - lib/llm/cohere.rb
 - lib/llm/google_palm.rb
 - lib/llm/hugging_face.rb
 - lib/llm/openai.rb
 - lib/llm/replicate.rb
-- lib/loader.rb
-- lib/loaders/base.rb
-- lib/loaders/docx.rb
-- lib/loaders/html.rb
-- lib/loaders/pdf.rb
-- lib/loaders/text.rb
 - lib/prompt/base.rb
 - lib/prompt/few_shot_prompt_template.rb
 - lib/prompt/loading.rb
data/lib/loader.rb
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
module Loader
|
2
|
-
def self.with(*loaders)
|
3
|
-
LoaderSet.new(loaders)
|
4
|
-
end
|
5
|
-
|
6
|
-
class LoaderSet
|
7
|
-
def initialize(loaders)
|
8
|
-
@loaders = Array(loaders)
|
9
|
-
end
|
10
|
-
|
11
|
-
def load(*paths)
|
12
|
-
Array(paths)
|
13
|
-
.flatten
|
14
|
-
.map { |path| first_loadable_loader(path)&.load }
|
15
|
-
.compact
|
16
|
-
end
|
17
|
-
|
18
|
-
def first_loadable_loader(path)
|
19
|
-
@loaders
|
20
|
-
.each do |loader_klass|
|
21
|
-
loader_instance = loader_klass.new(path)
|
22
|
-
return(loader_instance) if loader_instance.loadable?
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|
data/lib/loaders/base.rb
DELETED
@@ -1,19 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
# TODO: Add chunking options to the loaders
|
4
|
-
|
5
|
-
module Loaders
|
6
|
-
class Base
|
7
|
-
def self.load(path)
|
8
|
-
new.load(path)
|
9
|
-
end
|
10
|
-
|
11
|
-
def initialize(path)
|
12
|
-
@path = path
|
13
|
-
end
|
14
|
-
|
15
|
-
def loadable?
|
16
|
-
raise NotImplementedError
|
17
|
-
end
|
18
|
-
end
|
19
|
-
end
|
data/lib/loaders/docx.rb
DELETED
@@ -1,34 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Loaders
|
4
|
-
class Docx < Base
|
5
|
-
#
|
6
|
-
# This Loader parses Docx files into text.
|
7
|
-
# If you'd like to use it directly you can do so like this:
|
8
|
-
# Loaders::Docx.new("path/to/my.docx").load
|
9
|
-
#
|
10
|
-
# This parser is also invoked when you're adding data to a Vectorsearch DB:
|
11
|
-
# qdrant = Vectorsearch::Qdrant.new(...)
|
12
|
-
# path = Langchain.root.join("path/to/my.docx")
|
13
|
-
# qdrant.add_data(path: path)
|
14
|
-
#
|
15
|
-
|
16
|
-
def initialize(path)
|
17
|
-
depends_on "docx"
|
18
|
-
require "docx"
|
19
|
-
|
20
|
-
@path = path
|
21
|
-
end
|
22
|
-
|
23
|
-
# Check that the file is a `.docx` file
|
24
|
-
def loadable?
|
25
|
-
@path.to_s.end_with?(".docx")
|
26
|
-
end
|
27
|
-
|
28
|
-
def load
|
29
|
-
::Docx::Document
|
30
|
-
.open(@path.to_s)
|
31
|
-
.text
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|
data/lib/loaders/html.rb
DELETED
@@ -1,38 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require "open-uri"
|
4
|
-
|
5
|
-
module Loaders
|
6
|
-
class HTML < Base
|
7
|
-
# We only look for headings and paragraphs
|
8
|
-
TEXT_CONTENT_TAGS = %w[h1 h2 h3 h4 h5 h6 p]
|
9
|
-
|
10
|
-
#
|
11
|
-
# This Loader parses URL into a text.
|
12
|
-
# If you'd like to use it directly you can do so like this:
|
13
|
-
# Loaders::URL.new("https://nokogiri.org/").load
|
14
|
-
#
|
15
|
-
def initialize(url)
|
16
|
-
depends_on "nokogiri"
|
17
|
-
require "nokogiri"
|
18
|
-
|
19
|
-
@url = url
|
20
|
-
end
|
21
|
-
|
22
|
-
# Check that url is a valid URL
|
23
|
-
def loadable?
|
24
|
-
!!(@url =~ URI::DEFAULT_PARSER.make_regexp)
|
25
|
-
end
|
26
|
-
|
27
|
-
def load
|
28
|
-
return unless response.status.first == "200"
|
29
|
-
|
30
|
-
doc = Nokogiri::HTML(response.read)
|
31
|
-
doc.css(TEXT_CONTENT_TAGS.join(",")).map(&:inner_text).join("\n\n")
|
32
|
-
end
|
33
|
-
|
34
|
-
def response
|
35
|
-
@response ||= URI.parse(@url).open
|
36
|
-
end
|
37
|
-
end
|
38
|
-
end
|
data/lib/loaders/pdf.rb
DELETED
@@ -1,36 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Loaders
|
4
|
-
class PDF < Base
|
5
|
-
#
|
6
|
-
# This Loader parses PDF files into text.
|
7
|
-
# If you'd like to use it directly you can do so like this:
|
8
|
-
# Loaders::PDF.new("path/to/my.pdf").load
|
9
|
-
#
|
10
|
-
# This parser is also invoked when you're adding data to a Vectorsearch DB:
|
11
|
-
# qdrant = Vectorsearch::Qdrant.new(...)
|
12
|
-
# path = Langchain.root.join("path/to/my.pdf")
|
13
|
-
# qdrant.add_data(path: path)
|
14
|
-
#
|
15
|
-
|
16
|
-
def initialize(path)
|
17
|
-
depends_on "pdf-reader"
|
18
|
-
require "pdf-reader"
|
19
|
-
|
20
|
-
@path = path
|
21
|
-
end
|
22
|
-
|
23
|
-
# Check that the file is a PDF file
|
24
|
-
def loadable?
|
25
|
-
@path.to_s.end_with?(".pdf")
|
26
|
-
end
|
27
|
-
|
28
|
-
def load
|
29
|
-
::PDF::Reader
|
30
|
-
.new(@path)
|
31
|
-
.pages
|
32
|
-
.map(&:text)
|
33
|
-
.join("\n\n")
|
34
|
-
end
|
35
|
-
end
|
36
|
-
end
|
data/lib/loaders/text.rb
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Loaders
|
4
|
-
class Text < Base
|
5
|
-
#
|
6
|
-
# This Loader parses .txt files.
|
7
|
-
# If you'd like to use it directly you can do so like this:
|
8
|
-
# Loaders::Text.new("path/to/my.txt").load
|
9
|
-
#
|
10
|
-
# This parser is also invoked when you're adding data to a Vectorsearch DB:
|
11
|
-
# qdrant = Vectorsearch::Qdrant.new(...)
|
12
|
-
# path = Langchain.root.join("path/to/my.txt")
|
13
|
-
# qdrant.add_data(path: path)
|
14
|
-
#
|
15
|
-
|
16
|
-
def loadable?
|
17
|
-
@path.to_s.end_with?(".txt")
|
18
|
-
end
|
19
|
-
|
20
|
-
def load
|
21
|
-
@path.read
|
22
|
-
end
|
23
|
-
end
|
24
|
-
end
|