langchainrb 0.3.11 → 0.3.13
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.env.example +2 -1
- data/CHANGELOG.md +14 -0
- data/Gemfile.lock +11 -1
- data/README.md +26 -6
- data/examples/pdf_store_and_query_with_chroma.rb +2 -2
- data/lib/langchain/loader.rb +80 -0
- data/lib/langchain/processors/base.rb +14 -0
- data/lib/langchain/processors/csv.rb +21 -0
- data/lib/langchain/processors/docx.rb +24 -0
- data/lib/langchain/processors/html.rb +28 -0
- data/lib/langchain/processors/json.rb +17 -0
- data/lib/langchain/processors/jsonl.rb +19 -0
- data/lib/langchain/processors/pdf.rb +26 -0
- data/lib/langchain/processors/text.rb +17 -0
- data/lib/langchain.rb +14 -14
- data/lib/langchainrb.rb +1 -0
- data/lib/vectorsearch/base.rb +4 -12
- data/lib/vectorsearch/pgvector.rb +100 -0
- data/lib/vectorsearch/pinecone.rb +26 -10
- data/lib/version.rb +1 -1
- metadata +41 -8
- data/lib/loader.rb +0 -26
- data/lib/loaders/base.rb +0 -19
- data/lib/loaders/docx.rb +0 -34
- data/lib/loaders/html.rb +0 -38
- data/lib/loaders/pdf.rb +0 -36
- data/lib/loaders/text.rb +0 -24
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2ee811b2bac8fadea4d90c4212363a901829a4aac219da0f2a2dcbe7c6f59c5b
|
4
|
+
data.tar.gz: 8fa32e6df4aaf69cb6d29977913c1b8a30d6f65b777b1f90c8a7f504d869ca8f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cbb7e0c975333248c01082a47f7096fb9d6807c3b7619424eb9348238008d7b4257518287d9358114bf4e3a589349520ebf71ace00bf1fe8906afd27e8b1418a
|
7
|
+
data.tar.gz: 759444abe0b17518c6ef31fed6980f6bc0d3d096606860c4d6fddb8baeda4e0a23fc3909e42eba0f32912a786abec76cac54384533db2787e05d741f0907fa1d
|
data/.env.example
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,19 @@
|
|
1
1
|
## [Unreleased]
|
2
2
|
|
3
|
+
## [0.3.13] - 2023-05-26
|
4
|
+
- 🔍 Vectorsearch
|
5
|
+
- Pgvector support
|
6
|
+
- 🚚 Loaders
|
7
|
+
- CSV loader
|
8
|
+
- JSON loader
|
9
|
+
- JSONL loader
|
10
|
+
|
11
|
+
## [0.3.12] - 2023-05-25
|
12
|
+
- 🔍 Vectorsearch
|
13
|
+
- Introduce namespace support for Pinecone
|
14
|
+
- 🚚 Loaders
|
15
|
+
- Loaders overhaul
|
16
|
+
|
3
17
|
## [0.3.11] - 2023-05-23
|
4
18
|
- 🗣️ LLMs
|
5
19
|
- Introducing support for Google PaLM (Pathways Language Model)
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
langchainrb (0.3.
|
4
|
+
langchainrb (0.3.13)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
@@ -148,9 +148,13 @@ GEM
|
|
148
148
|
milvus (0.9.1)
|
149
149
|
faraday (~> 1)
|
150
150
|
mini_mime (1.1.2)
|
151
|
+
mini_portile2 (2.8.2)
|
151
152
|
minitest (5.18.0)
|
152
153
|
multi_xml (0.6.0)
|
153
154
|
multipart-post (2.3.0)
|
155
|
+
nokogiri (1.14.3)
|
156
|
+
mini_portile2 (~> 2.8.0)
|
157
|
+
racc (~> 1.4)
|
154
158
|
nokogiri (1.14.3-arm64-darwin)
|
155
159
|
racc (~> 1.4)
|
156
160
|
nokogiri (1.14.3-x86_64-darwin)
|
@@ -166,6 +170,8 @@ GEM
|
|
166
170
|
hashery (~> 2.0)
|
167
171
|
ruby-rc4
|
168
172
|
ttfunk
|
173
|
+
pg (1.5.3)
|
174
|
+
pgvector (0.1.1)
|
169
175
|
pinecone (0.1.71)
|
170
176
|
dry-struct (~> 1.6.0)
|
171
177
|
dry-validation (~> 1.10.0)
|
@@ -271,7 +277,9 @@ GEM
|
|
271
277
|
zeitwerk (2.6.8)
|
272
278
|
|
273
279
|
PLATFORMS
|
280
|
+
arm64-darwin-21
|
274
281
|
arm64-darwin-22
|
282
|
+
ruby
|
275
283
|
x86_64-darwin-19
|
276
284
|
x86_64-darwin-22
|
277
285
|
x86_64-linux
|
@@ -289,6 +297,8 @@ DEPENDENCIES
|
|
289
297
|
milvus (~> 0.9.0)
|
290
298
|
nokogiri (~> 1.13)
|
291
299
|
pdf-reader (~> 1.4)
|
300
|
+
pg (~> 1.5)
|
301
|
+
pgvector (< 0.2)
|
292
302
|
pinecone (~> 0.1.6)
|
293
303
|
pry-byebug (~> 3.10.0)
|
294
304
|
qdrant-ruby (~> 0.9.0)
|
data/README.md
CHANGED
@@ -268,12 +268,32 @@ agent.run(question: "How many full soccer fields would be needed to cover the di
|
|
268
268
|
|
269
269
|
Need to read data from various sources? Load it up.
|
270
270
|
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
271
|
+
##### Usage
|
272
|
+
|
273
|
+
Just call `Langchain::Loader.load` with the path to the file or a URL you want to load.
|
274
|
+
|
275
|
+
```ruby
|
276
|
+
Langchain::Loader.load('/path/to/file.pdf')
|
277
|
+
```
|
278
|
+
|
279
|
+
or
|
280
|
+
|
281
|
+
```ruby
|
282
|
+
Langchain::Loader.load('https://www.example.com/file.pdf')
|
283
|
+
```
|
284
|
+
|
285
|
+
##### Supported Formats
|
286
|
+
|
287
|
+
|
288
|
+
| Format | Processor                    | Gem Requirements             |
|
289
|
+
| ------ | ---------------------------- | :--------------------------: |
|
290
|
+
| docx | Langchain::Processors::Docx | `gem "docx", "~> 0.8.0"` |
|
291
|
+
| html | Langchain::Processors::HTML | `gem "nokogiri", "~> 1.13"` |
|
292
|
+
| pdf | Langchain::Processors::PDF | `gem "pdf-reader", "~> 1.4"` |
|
293
|
+
| text | Langchain::Processors::Text | |
|
294
|
+
| JSON | Langchain::Processors::JSON | |
|
295
|
+
| JSONL | Langchain::Processors::JSONL | |
|
296
|
+
| csv | Langchain::Processors::CSV | |
|
277
297
|
|
278
298
|
## Examples
|
279
299
|
Additional examples available: [/examples](https://github.com/andreibondarev/langchainrb/tree/main/examples)
|
@@ -0,0 +1,80 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "open-uri"
|
4
|
+
|
5
|
+
module Langchain
|
6
|
+
class Loader
|
7
|
+
class FileNotFound < StandardError; end
|
8
|
+
|
9
|
+
class UnknownFormatError < StandardError; end
|
10
|
+
|
11
|
+
URI_REGEX = %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}
|
12
|
+
|
13
|
+
# Load data from a file or url
|
14
|
+
# Equivalent to Langchain::Loader.new(path).load
|
15
|
+
# @param path [String | Pathname] path to file or url
|
16
|
+
# @return [String] file content
|
17
|
+
def self.load(path)
|
18
|
+
new(path).load
|
19
|
+
end
|
20
|
+
|
21
|
+
# Initialize Langchain::Loader
|
22
|
+
# @param path [String | Pathname] path to file or url
|
23
|
+
# @return [Langchain::Loader] loader instance
|
24
|
+
def initialize(path)
|
25
|
+
@path = path
|
26
|
+
end
|
27
|
+
|
28
|
+
# Check if path is url
|
29
|
+
# @return [Boolean] true if path is url
|
30
|
+
def url?
|
31
|
+
return false if @path.is_a?(Pathname)
|
32
|
+
|
33
|
+
!!(@path =~ URI_REGEX)
|
34
|
+
end
|
35
|
+
|
36
|
+
# Load data from a file or url
|
37
|
+
# @return [String] file content
|
38
|
+
def load
|
39
|
+
url? ? from_url(@path) : from_path(@path)
|
40
|
+
end
|
41
|
+
|
42
|
+
private
|
43
|
+
|
44
|
+
def from_url(url)
|
45
|
+
process do
|
46
|
+
data = URI.parse(url).open
|
47
|
+
processor = find_processor(:CONTENT_TYPES, data.content_type)
|
48
|
+
[data, processor]
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
def from_path(path)
|
53
|
+
raise FileNotFound unless File.exist?(path)
|
54
|
+
|
55
|
+
process do
|
56
|
+
[File.open(path), find_processor(:EXTENSIONS, File.extname(path))]
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
def process(&block)
|
61
|
+
data, processor = yield
|
62
|
+
|
63
|
+
raise UnknownFormatError unless processor
|
64
|
+
|
65
|
+
Langchain::Processors.const_get(processor).new.parse(data)
|
66
|
+
end
|
67
|
+
|
68
|
+
def find_processor(constant, value)
|
69
|
+
processors.find { |klass| processor_matches? "#{klass}::#{constant}", value }
|
70
|
+
end
|
71
|
+
|
72
|
+
def processor_matches?(constant, value)
|
73
|
+
Langchain::Processors.const_get(constant).include?(value)
|
74
|
+
end
|
75
|
+
|
76
|
+
def processors
|
77
|
+
Langchain::Processors.constants
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "csv"
|
4
|
+
|
5
|
+
module Langchain
|
6
|
+
module Processors
|
7
|
+
class CSV < Base
|
8
|
+
EXTENSIONS = [".csv"]
|
9
|
+
CONTENT_TYPES = ["text/csv"]
|
10
|
+
|
11
|
+
# Parse the document and return the text
|
12
|
+
# @param [File] data
|
13
|
+
# @return [Array of Hash]
|
14
|
+
def parse(data)
|
15
|
+
::CSV.new(data.read).map do |row|
|
16
|
+
row.map(&:strip)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Langchain
|
4
|
+
module Processors
|
5
|
+
class Docx < Base
|
6
|
+
EXTENSIONS = [".docx"]
|
7
|
+
CONTENT_TYPES = ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
|
8
|
+
|
9
|
+
def initialize
|
10
|
+
depends_on "docx"
|
11
|
+
require "docx"
|
12
|
+
end
|
13
|
+
|
14
|
+
# Parse the document and return the text
|
15
|
+
# @param [File] data
|
16
|
+
# @return [String]
|
17
|
+
def parse(data)
|
18
|
+
::Docx::Document
|
19
|
+
.open(StringIO.new(data.read))
|
20
|
+
.text
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Langchain
|
4
|
+
module Processors
|
5
|
+
class HTML < Base
|
6
|
+
EXTENSIONS = [".html", ".htm"]
|
7
|
+
CONTENT_TYPES = ["text/html"]
|
8
|
+
|
9
|
+
# We only look for headings and paragraphs
|
10
|
+
TEXT_CONTENT_TAGS = %w[h1 h2 h3 h4 h5 h6 p]
|
11
|
+
|
12
|
+
def initialize
|
13
|
+
depends_on "nokogiri"
|
14
|
+
require "nokogiri"
|
15
|
+
end
|
16
|
+
|
17
|
+
# Parse the document and return the text
|
18
|
+
# @param [File] data
|
19
|
+
# @return [String]
|
20
|
+
def parse(data)
|
21
|
+
Nokogiri::HTML(data.read)
|
22
|
+
.css(TEXT_CONTENT_TAGS.join(","))
|
23
|
+
.map(&:inner_text)
|
24
|
+
.join("\n\n")
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Langchain
|
4
|
+
module Processors
|
5
|
+
class JSON < Base
|
6
|
+
EXTENSIONS = [".json"]
|
7
|
+
CONTENT_TYPES = ["application/json"]
|
8
|
+
|
9
|
+
# Parse the document and return the text
|
10
|
+
# @param [File] data
|
11
|
+
# @return [Hash]
|
12
|
+
def parse(data)
|
13
|
+
::JSON.parse(data.read)
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Langchain
|
4
|
+
module Processors
|
5
|
+
class JSONL < Base
|
6
|
+
EXTENSIONS = [".jsonl"]
|
7
|
+
CONTENT_TYPES = ["application/jsonl", "application/json-lines", "application/jsonlines"]
|
8
|
+
|
9
|
+
# Parse the document and return the text
|
10
|
+
# @param [File] data
|
11
|
+
# @return [Array of Hash]
|
12
|
+
def parse(data)
|
13
|
+
data.read.lines.map do |line|
|
14
|
+
::JSON.parse(line)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Langchain
|
4
|
+
module Processors
|
5
|
+
class PDF < Base
|
6
|
+
EXTENSIONS = [".pdf"]
|
7
|
+
CONTENT_TYPES = ["application/pdf"]
|
8
|
+
|
9
|
+
def initialize
|
10
|
+
depends_on "pdf-reader"
|
11
|
+
require "pdf-reader"
|
12
|
+
end
|
13
|
+
|
14
|
+
# Parse the document and return the text
|
15
|
+
# @param [File] data
|
16
|
+
# @return [String]
|
17
|
+
def parse(data)
|
18
|
+
::PDF::Reader
|
19
|
+
.new(StringIO.new(data.read))
|
20
|
+
.pages
|
21
|
+
.map(&:text)
|
22
|
+
.join("\n\n")
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Langchain
|
4
|
+
module Processors
|
5
|
+
class Text < Base
|
6
|
+
EXTENSIONS = [".txt"]
|
7
|
+
CONTENT_TYPES = ["text/plain"]
|
8
|
+
|
9
|
+
# Parse the document and return the text
|
10
|
+
# @param [File] data
|
11
|
+
# @return [String]
|
12
|
+
def parse(data)
|
13
|
+
data.read
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
data/lib/langchain.rb
CHANGED
@@ -7,7 +7,6 @@ require_relative "./version"
|
|
7
7
|
require_relative "./dependency_helper"
|
8
8
|
module Langchain
|
9
9
|
class << self
|
10
|
-
attr_accessor :default_loaders
|
11
10
|
attr_accessor :logger
|
12
11
|
|
13
12
|
attr_reader :root
|
@@ -16,6 +15,19 @@ module Langchain
|
|
16
15
|
@logger ||= ::Logger.new($stdout, level: :warn, formatter: ->(severity, datetime, progname, msg) { "[LangChain.rb] #{msg}\n" })
|
17
16
|
|
18
17
|
@root = Pathname.new(__dir__)
|
18
|
+
|
19
|
+
autoload :Loader, "langchain/loader"
|
20
|
+
|
21
|
+
module Processors
|
22
|
+
autoload :Base, "langchain/processors/base"
|
23
|
+
autoload :CSV, "langchain/processors/csv"
|
24
|
+
autoload :Docx, "langchain/processors/docx"
|
25
|
+
autoload :HTML, "langchain/processors/html"
|
26
|
+
autoload :JSON, "langchain/processors/json"
|
27
|
+
autoload :JSONL, "langchain/processors/jsonl"
|
28
|
+
autoload :PDF, "langchain/processors/pdf"
|
29
|
+
autoload :Text, "langchain/processors/text"
|
30
|
+
end
|
19
31
|
end
|
20
32
|
|
21
33
|
module Agent
|
@@ -28,6 +40,7 @@ module Vectorsearch
|
|
28
40
|
autoload :Chroma, "vectorsearch/chroma"
|
29
41
|
autoload :Milvus, "vectorsearch/milvus"
|
30
42
|
autoload :Pinecone, "vectorsearch/pinecone"
|
43
|
+
autoload :Pgvector, "vectorsearch/pgvector"
|
31
44
|
autoload :Qdrant, "vectorsearch/qdrant"
|
32
45
|
autoload :Weaviate, "vectorsearch/weaviate"
|
33
46
|
end
|
@@ -55,16 +68,3 @@ module Tool
|
|
55
68
|
autoload :SerpApi, "tool/serp_api"
|
56
69
|
autoload :Wikipedia, "tool/wikipedia"
|
57
70
|
end
|
58
|
-
|
59
|
-
module Loaders
|
60
|
-
autoload :Base, "loaders/base"
|
61
|
-
autoload :Docx, "loaders/docx"
|
62
|
-
autoload :PDF, "loaders/pdf"
|
63
|
-
autoload :Text, "loaders/text"
|
64
|
-
autoload :HTML, "loaders/html"
|
65
|
-
end
|
66
|
-
|
67
|
-
autoload :Loader, "loader"
|
68
|
-
|
69
|
-
# Load the default Loaders
|
70
|
-
Langchain.default_loaders ||= [::Loaders::Text, ::Loaders::PDF, ::Loaders::Docx]
|
data/lib/langchainrb.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "langchain"
|
data/lib/vectorsearch/base.rb
CHANGED
@@ -19,8 +19,6 @@ module Vectorsearch
|
|
19
19
|
@llm_api_key = llm_api_key
|
20
20
|
|
21
21
|
@llm_client = LLM.const_get(LLM::Base::LLMS.fetch(llm)).new(api_key: llm_api_key)
|
22
|
-
|
23
|
-
@loaders = Langchain.default_loaders
|
24
22
|
end
|
25
23
|
|
26
24
|
# Method supported by Vectorsearch DB to create a default schema
|
@@ -74,18 +72,12 @@ module Vectorsearch
|
|
74
72
|
raise ArgumentError, "Either path or paths must be provided" if path.nil? && paths.nil?
|
75
73
|
raise ArgumentError, "Either path or paths must be provided, not both" if !path.nil? && !paths.nil?
|
76
74
|
|
77
|
-
texts =
|
78
|
-
|
79
|
-
|
80
|
-
|
75
|
+
texts = Array(path || paths)
|
76
|
+
.flatten
|
77
|
+
.map { |path| Langchain::Loader.new(path)&.load }
|
78
|
+
.compact
|
81
79
|
|
82
80
|
add_texts(texts: texts)
|
83
81
|
end
|
84
|
-
|
85
|
-
attr_reader :loaders
|
86
|
-
|
87
|
-
def add_loader(*loaders)
|
88
|
-
loaders.each { |loader| @loaders << loader }
|
89
|
-
end
|
90
82
|
end
|
91
83
|
end
|
@@ -0,0 +1,100 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Vectorsearch
|
4
|
+
# The PostgreSQL vector search adapter
|
5
|
+
class Pgvector < Base
|
6
|
+
# @param url [String] The URL of the PostgreSQL database
|
7
|
+
# @param index_name [String] The name of the table to use for the index
|
8
|
+
# @param llm [String] The URL of the Language Layer API
|
9
|
+
# @param llm_api_key [String] The API key for the Language Layer API
|
10
|
+
# @param api_key [String] The API key for the Vectorsearch DB (not used for PostgreSQL)
|
11
|
+
def initialize(url:, index_name:, llm:, llm_api_key:, api_key: nil)
|
12
|
+
require "pg"
|
13
|
+
require "pgvector"
|
14
|
+
|
15
|
+
@client = ::PG.connect(url)
|
16
|
+
registry = ::PG::BasicTypeRegistry.new.define_default_types
|
17
|
+
::Pgvector::PG.register_vector(registry)
|
18
|
+
@client.type_map_for_results = PG::BasicTypeMapForResults.new(@client, registry: registry)
|
19
|
+
|
20
|
+
@index_name = index_name
|
21
|
+
|
22
|
+
super(llm: llm, llm_api_key: llm_api_key)
|
23
|
+
end
|
24
|
+
|
25
|
+
# Add a list of texts to the index
|
26
|
+
# @param texts [Array<String>] The texts to add to the index
|
27
|
+
# @return [PG::Result] The response from the database
|
28
|
+
def add_texts(texts:)
|
29
|
+
data = texts.flat_map do |text|
|
30
|
+
[text, llm_client.embed(text: text)]
|
31
|
+
end
|
32
|
+
values = texts.length.times.map { |i| "($#{2 * i + 1}, $#{2 * i + 2})" }.join(",")
|
33
|
+
client.exec_params(
|
34
|
+
"INSERT INTO #{@index_name} (content, vectors) VALUES #{values};",
|
35
|
+
data
|
36
|
+
)
|
37
|
+
end
|
38
|
+
|
39
|
+
# Create default schema
|
40
|
+
# @return [PG::Result] The response from the database
|
41
|
+
def create_default_schema
|
42
|
+
client.exec("CREATE EXTENSION IF NOT EXISTS vector;")
|
43
|
+
client.exec(
|
44
|
+
<<~SQL
|
45
|
+
CREATE TABLE IF NOT EXISTS #{@index_name} (
|
46
|
+
id serial PRIMARY KEY,
|
47
|
+
content TEXT,
|
48
|
+
vectors VECTOR(#{default_dimension})
|
49
|
+
);
|
50
|
+
SQL
|
51
|
+
)
|
52
|
+
end
|
53
|
+
|
54
|
+
# Search for similar texts in the index
|
55
|
+
# @param query [String] The text to search for
|
56
|
+
# @param k [Integer] The number of top results to return
|
57
|
+
# @return [Array<Hash>] The results of the search
|
58
|
+
def similarity_search(query:, k: 4)
|
59
|
+
embedding = llm_client.embed(text: query)
|
60
|
+
|
61
|
+
similarity_search_by_vector(
|
62
|
+
embedding: embedding,
|
63
|
+
k: k
|
64
|
+
)
|
65
|
+
end
|
66
|
+
|
67
|
+
# Search for similar texts in the index by the passed in vector.
|
68
|
+
# You must generate your own vector using the same LLM that generated the embeddings stored in the Vectorsearch DB.
|
69
|
+
# @param embedding [Array<Float>] The vector to search for
|
70
|
+
# @param k [Integer] The number of top results to return
|
71
|
+
# @return [Array<Hash>] The results of the search
|
72
|
+
def similarity_search_by_vector(embedding:, k: 4)
|
73
|
+
result = client.transaction do |conn|
|
74
|
+
conn.exec("SET LOCAL ivfflat.probes = 10;")
|
75
|
+
query = <<~SQL
|
76
|
+
SELECT id, content FROM #{@index_name} ORDER BY vectors <-> $1 ASC LIMIT $2;
|
77
|
+
SQL
|
78
|
+
conn.exec_params(query, [embedding, k])
|
79
|
+
end
|
80
|
+
|
81
|
+
result.to_a
|
82
|
+
end
|
83
|
+
|
84
|
+
# Ask a question and return the answer
|
85
|
+
# @param question [String] The question to ask
|
86
|
+
# @return [String] The answer to the question
|
87
|
+
def ask(question:)
|
88
|
+
search_results = similarity_search(query: question)
|
89
|
+
|
90
|
+
context = search_results.map do |result|
|
91
|
+
result["content"].to_s
|
92
|
+
end
|
93
|
+
context = context.join("\n---\n")
|
94
|
+
|
95
|
+
prompt = generate_prompt(question: question, context: context)
|
96
|
+
|
97
|
+
llm_client.chat(prompt: prompt)
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
@@ -25,20 +25,22 @@ module Vectorsearch
|
|
25
25
|
|
26
26
|
# Add a list of texts to the index
|
27
27
|
# @param texts [Array] The list of texts to add
|
28
|
+
# @param namespace [String] The namespace to add the texts to
|
29
|
+
# @param metadata [Hash] The metadata to use for the texts
|
28
30
|
# @return [Hash] The response from the server
|
29
|
-
def add_texts(texts:)
|
31
|
+
def add_texts(texts:, namespace: "", metadata: nil)
|
30
32
|
vectors = texts.map do |text|
|
31
33
|
{
|
32
34
|
# TODO: Allows passing in your own IDs
|
33
35
|
id: SecureRandom.uuid,
|
34
|
-
metadata: {content: text},
|
36
|
+
metadata: metadata || {content: text},
|
35
37
|
values: llm_client.embed(text: text)
|
36
38
|
}
|
37
39
|
end
|
38
40
|
|
39
41
|
index = client.index(index_name)
|
40
42
|
|
41
|
-
index.upsert(vectors: vectors)
|
43
|
+
index.upsert(vectors: vectors, namespace: namespace)
|
42
44
|
end
|
43
45
|
|
44
46
|
# Create the index with the default schema
|
@@ -54,40 +56,54 @@ module Vectorsearch
|
|
54
56
|
# Search for similar texts
|
55
57
|
# @param query [String] The text to search for
|
56
58
|
# @param k [Integer] The number of results to return
|
59
|
+
# @param namespace [String] The namespace to search in
|
60
|
+
# @param filter [String] The filter to use
|
57
61
|
# @return [Array] The list of results
|
58
62
|
def similarity_search(
|
59
63
|
query:,
|
60
|
-
k: 4
|
64
|
+
k: 4,
|
65
|
+
namespace: "",
|
66
|
+
filter: nil
|
61
67
|
)
|
62
68
|
embedding = llm_client.embed(text: query)
|
63
69
|
|
64
70
|
similarity_search_by_vector(
|
65
71
|
embedding: embedding,
|
66
|
-
k: k
|
72
|
+
k: k,
|
73
|
+
namespace: namespace,
|
74
|
+
filter: filter
|
67
75
|
)
|
68
76
|
end
|
69
77
|
|
70
78
|
# Search for similar texts by embedding
|
71
79
|
# @param embedding [Array] The embedding to search for
|
72
80
|
# @param k [Integer] The number of results to return
|
81
|
+
# @param namespace [String] The namespace to search in
|
82
|
+
# @param filter [String] The filter to use
|
73
83
|
# @return [Array] The list of results
|
74
|
-
def similarity_search_by_vector(embedding:, k: 4)
|
84
|
+
def similarity_search_by_vector(embedding:, k: 4, namespace: "", filter: nil)
|
75
85
|
index = client.index(index_name)
|
76
86
|
|
77
|
-
|
87
|
+
query_params = {
|
78
88
|
vector: embedding,
|
89
|
+
namespace: namespace,
|
90
|
+
filter: filter,
|
79
91
|
top_k: k,
|
80
92
|
include_values: true,
|
81
93
|
include_metadata: true
|
82
|
-
|
94
|
+
}.compact
|
95
|
+
|
96
|
+
response = index.query(query_params)
|
83
97
|
response.dig("matches")
|
84
98
|
end
|
85
99
|
|
86
100
|
# Ask a question and return the answer
|
87
101
|
# @param question [String] The question to ask
|
102
|
+
# @param namespace [String] The namespace to search in
|
103
|
+
# @param filter [String] The filter to use
|
88
104
|
# @return [String] The answer to the question
|
89
|
-
def ask(question:)
|
90
|
-
search_results = similarity_search(query: question)
|
105
|
+
def ask(question:, namespace: "", filter: nil)
|
106
|
+
search_results = similarity_search(query: question, namespace: namespace, filter: filter)
|
91
107
|
|
92
108
|
context = search_results.map do |result|
|
93
109
|
result.dig("metadata").to_s
|
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: langchainrb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrei Bondarev
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-05-
|
11
|
+
date: 2023-05-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: dotenv-rails
|
@@ -164,6 +164,34 @@ dependencies:
|
|
164
164
|
- - "~>"
|
165
165
|
- !ruby/object:Gem::Version
|
166
166
|
version: '1.13'
|
167
|
+
- !ruby/object:Gem::Dependency
|
168
|
+
name: pg
|
169
|
+
requirement: !ruby/object:Gem::Requirement
|
170
|
+
requirements:
|
171
|
+
- - "~>"
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: '1.5'
|
174
|
+
type: :development
|
175
|
+
prerelease: false
|
176
|
+
version_requirements: !ruby/object:Gem::Requirement
|
177
|
+
requirements:
|
178
|
+
- - "~>"
|
179
|
+
- !ruby/object:Gem::Version
|
180
|
+
version: '1.5'
|
181
|
+
- !ruby/object:Gem::Dependency
|
182
|
+
name: pgvector
|
183
|
+
requirement: !ruby/object:Gem::Requirement
|
184
|
+
requirements:
|
185
|
+
- - "<"
|
186
|
+
- !ruby/object:Gem::Version
|
187
|
+
version: '0.2'
|
188
|
+
type: :development
|
189
|
+
prerelease: false
|
190
|
+
version_requirements: !ruby/object:Gem::Requirement
|
191
|
+
requirements:
|
192
|
+
- - "<"
|
193
|
+
- !ruby/object:Gem::Version
|
194
|
+
version: '0.2'
|
167
195
|
- !ruby/object:Gem::Dependency
|
168
196
|
name: pdf-reader
|
169
197
|
requirement: !ruby/object:Gem::Requirement
|
@@ -288,18 +316,22 @@ files:
|
|
288
316
|
- lib/agent/chain_of_thought_agent/chain_of_thought_agent_prompt.json
|
289
317
|
- lib/dependency_helper.rb
|
290
318
|
- lib/langchain.rb
|
319
|
+
- lib/langchain/loader.rb
|
320
|
+
- lib/langchain/processors/base.rb
|
321
|
+
- lib/langchain/processors/csv.rb
|
322
|
+
- lib/langchain/processors/docx.rb
|
323
|
+
- lib/langchain/processors/html.rb
|
324
|
+
- lib/langchain/processors/json.rb
|
325
|
+
- lib/langchain/processors/jsonl.rb
|
326
|
+
- lib/langchain/processors/pdf.rb
|
327
|
+
- lib/langchain/processors/text.rb
|
328
|
+
- lib/langchainrb.rb
|
291
329
|
- lib/llm/base.rb
|
292
330
|
- lib/llm/cohere.rb
|
293
331
|
- lib/llm/google_palm.rb
|
294
332
|
- lib/llm/hugging_face.rb
|
295
333
|
- lib/llm/openai.rb
|
296
334
|
- lib/llm/replicate.rb
|
297
|
-
- lib/loader.rb
|
298
|
-
- lib/loaders/base.rb
|
299
|
-
- lib/loaders/docx.rb
|
300
|
-
- lib/loaders/html.rb
|
301
|
-
- lib/loaders/pdf.rb
|
302
|
-
- lib/loaders/text.rb
|
303
335
|
- lib/prompt/base.rb
|
304
336
|
- lib/prompt/few_shot_prompt_template.rb
|
305
337
|
- lib/prompt/loading.rb
|
@@ -311,6 +343,7 @@ files:
|
|
311
343
|
- lib/vectorsearch/base.rb
|
312
344
|
- lib/vectorsearch/chroma.rb
|
313
345
|
- lib/vectorsearch/milvus.rb
|
346
|
+
- lib/vectorsearch/pgvector.rb
|
314
347
|
- lib/vectorsearch/pinecone.rb
|
315
348
|
- lib/vectorsearch/qdrant.rb
|
316
349
|
- lib/vectorsearch/weaviate.rb
|
data/lib/loader.rb
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
module Loader
|
2
|
-
def self.with(*loaders)
|
3
|
-
LoaderSet.new(loaders)
|
4
|
-
end
|
5
|
-
|
6
|
-
class LoaderSet
|
7
|
-
def initialize(loaders)
|
8
|
-
@loaders = Array(loaders)
|
9
|
-
end
|
10
|
-
|
11
|
-
def load(*paths)
|
12
|
-
Array(paths)
|
13
|
-
.flatten
|
14
|
-
.map { |path| first_loadable_loader(path)&.load }
|
15
|
-
.compact
|
16
|
-
end
|
17
|
-
|
18
|
-
def first_loadable_loader(path)
|
19
|
-
@loaders
|
20
|
-
.each do |loader_klass|
|
21
|
-
loader_instance = loader_klass.new(path)
|
22
|
-
return(loader_instance) if loader_instance.loadable?
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|
data/lib/loaders/base.rb
DELETED
@@ -1,19 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
# TODO: Add chunking options to the loaders
|
4
|
-
|
5
|
-
module Loaders
|
6
|
-
class Base
|
7
|
-
def self.load(path)
|
8
|
-
new.load(path)
|
9
|
-
end
|
10
|
-
|
11
|
-
def initialize(path)
|
12
|
-
@path = path
|
13
|
-
end
|
14
|
-
|
15
|
-
def loadable?
|
16
|
-
raise NotImplementedError
|
17
|
-
end
|
18
|
-
end
|
19
|
-
end
|
data/lib/loaders/docx.rb
DELETED
@@ -1,34 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Loaders
|
4
|
-
class Docx < Base
|
5
|
-
#
|
6
|
-
# This Loader parses Docx files into text.
|
7
|
-
# If you'd like to use it directly you can do so like this:
|
8
|
-
# Loaders::Docx.new("path/to/my.docx").load
|
9
|
-
#
|
10
|
-
# This parser is also invoked when you're adding data to a Vectorsearch DB:
|
11
|
-
# qdrant = Vectorsearch::Qdrant.new(...)
|
12
|
-
# path = Langchain.root.join("path/to/my.docx")
|
13
|
-
# qdrant.add_data(path: path)
|
14
|
-
#
|
15
|
-
|
16
|
-
def initialize(path)
|
17
|
-
depends_on "docx"
|
18
|
-
require "docx"
|
19
|
-
|
20
|
-
@path = path
|
21
|
-
end
|
22
|
-
|
23
|
-
# Check that the file is a `.docx` file
|
24
|
-
def loadable?
|
25
|
-
@path.to_s.end_with?(".docx")
|
26
|
-
end
|
27
|
-
|
28
|
-
def load
|
29
|
-
::Docx::Document
|
30
|
-
.open(@path.to_s)
|
31
|
-
.text
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|
data/lib/loaders/html.rb
DELETED
@@ -1,38 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require "open-uri"
|
4
|
-
|
5
|
-
module Loaders
|
6
|
-
class HTML < Base
|
7
|
-
# We only look for headings and paragraphs
|
8
|
-
TEXT_CONTENT_TAGS = %w[h1 h2 h3 h4 h5 h6 p]
|
9
|
-
|
10
|
-
#
|
11
|
-
# This Loader parses URL into a text.
|
12
|
-
# If you'd like to use it directly you can do so like this:
|
13
|
-
# Loaders::URL.new("https://nokogiri.org/").load
|
14
|
-
#
|
15
|
-
def initialize(url)
|
16
|
-
depends_on "nokogiri"
|
17
|
-
require "nokogiri"
|
18
|
-
|
19
|
-
@url = url
|
20
|
-
end
|
21
|
-
|
22
|
-
# Check that url is a valid URL
|
23
|
-
def loadable?
|
24
|
-
!!(@url =~ URI::DEFAULT_PARSER.make_regexp)
|
25
|
-
end
|
26
|
-
|
27
|
-
def load
|
28
|
-
return unless response.status.first == "200"
|
29
|
-
|
30
|
-
doc = Nokogiri::HTML(response.read)
|
31
|
-
doc.css(TEXT_CONTENT_TAGS.join(",")).map(&:inner_text).join("\n\n")
|
32
|
-
end
|
33
|
-
|
34
|
-
def response
|
35
|
-
@response ||= URI.parse(@url).open
|
36
|
-
end
|
37
|
-
end
|
38
|
-
end
|
data/lib/loaders/pdf.rb
DELETED
@@ -1,36 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Loaders
|
4
|
-
class PDF < Base
|
5
|
-
#
|
6
|
-
# This Loader parses PDF files into text.
|
7
|
-
# If you'd like to use it directly you can do so like this:
|
8
|
-
# Loaders::PDF.new("path/to/my.pdf").load
|
9
|
-
#
|
10
|
-
# This parser is also invoked when you're adding data to a Vectorsearch DB:
|
11
|
-
# qdrant = Vectorsearch::Qdrant.new(...)
|
12
|
-
# path = Langchain.root.join("path/to/my.pdf")
|
13
|
-
# qdrant.add_data(path: path)
|
14
|
-
#
|
15
|
-
|
16
|
-
def initialize(path)
|
17
|
-
depends_on "pdf-reader"
|
18
|
-
require "pdf-reader"
|
19
|
-
|
20
|
-
@path = path
|
21
|
-
end
|
22
|
-
|
23
|
-
# Check that the file is a PDF file
|
24
|
-
def loadable?
|
25
|
-
@path.to_s.end_with?(".pdf")
|
26
|
-
end
|
27
|
-
|
28
|
-
def load
|
29
|
-
::PDF::Reader
|
30
|
-
.new(@path)
|
31
|
-
.pages
|
32
|
-
.map(&:text)
|
33
|
-
.join("\n\n")
|
34
|
-
end
|
35
|
-
end
|
36
|
-
end
|
data/lib/loaders/text.rb
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Loaders
|
4
|
-
class Text < Base
|
5
|
-
#
|
6
|
-
# This Loader parses .txt files.
|
7
|
-
# If you'd like to use it directly you can do so like this:
|
8
|
-
# Loaders::Text.new("path/to/my.txt").load
|
9
|
-
#
|
10
|
-
# This parser is also invoked when you're adding data to a Vectorsearch DB:
|
11
|
-
# qdrant = Vectorsearch::Qdrant.new(...)
|
12
|
-
# path = Langchain.root.join("path/to/my.txt")
|
13
|
-
# qdrant.add_data(path: path)
|
14
|
-
#
|
15
|
-
|
16
|
-
def loadable?
|
17
|
-
@path.to_s.end_with?(".txt")
|
18
|
-
end
|
19
|
-
|
20
|
-
def load
|
21
|
-
@path.read
|
22
|
-
end
|
23
|
-
end
|
24
|
-
end
|