RubyGems - loader-ruby - Versions diffs - 0.1.1 - Mend

loader-ruby 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +14 -0
data/LICENSE +21 -0
data/README.md +38 -0
data/Rakefile +11 -0
data/lib/loader_ruby/configuration.rb +15 -0
data/lib/loader_ruby/document.rb +43 -0
data/lib/loader_ruby/encoding_detector.rb +61 -0
data/lib/loader_ruby/error.rb +11 -0
data/lib/loader_ruby/html_extractor.rb +32 -0
data/lib/loader_ruby/loaders/base.rb +51 -0
data/lib/loader_ruby/loaders/csv.rb +55 -0
data/lib/loader_ruby/loaders/docx.rb +33 -0
data/lib/loader_ruby/loaders/html.rb +34 -0
data/lib/loader_ruby/loaders/pdf.rb +36 -0
data/lib/loader_ruby/loaders/text.rb +27 -0
data/lib/loader_ruby/loaders/web.rb +94 -0
data/lib/loader_ruby/version.rb +5 -0
data/lib/loader_ruby.rb +59 -0
data/loader-ruby.gemspec +35 -0
metadata +119 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 663d36e070a5a41ba1c34ff5cc7d734b34f92eebbb7a5e104d7e2ceb7f4854f0
+  data.tar.gz: fbea2f56dce6dfca84ff898fa3626c0ec56092d3466fdc82f0327f9cd937a5ba
+SHA512:
+  metadata.gz: 9cb3f7f935f3779a68524f11af2c734d2219b68c1a7d3b8c0cd841a71052452146d55819d6e91938da7d8904b7d028899beb5b835cd9354be621f6e2ad9de344
+  data.tar.gz: 59df44fe24d1e8a7d15bf48e9da9f72fca101a0ea8ae81b46c9cf9bfc45b86c65dfa5388def61a4ec86bc7a0d7e0b354bd31a214490cddbd75fb78e8b8d928fd

data/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,14 @@
+# Changelog
+## 0.1.0 (2026-03-09)
+- Initial release
+- Text/Markdown loader
+- PDF loader (via pdf-reader gem)
+- DOCX loader (via docx gem)
+- CSV/TSV loader with row-as-document support
+- HTML loader (via nokogiri gem)
+- Web page loader with HTTP fetching
+- Auto-detection of file format
+- Normalized Document result object
+- Configuration DSL

data/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Johannes Dwi Cahyo
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,38 @@
+# loader-ruby
+Document loader library for Ruby RAG pipelines. Extracts text from plain text/Markdown, PDF, DOCX, CSV/TSV, HTML, and web pages.
+## Installation
+```ruby
+gem "loader-ruby", "~> 0.1"
+# Optional dependencies for specific formats:
+gem "pdf-reader"  # PDF support
+gem "nokogiri"    # HTML/web support
+gem "docx"        # DOCX support
+```
+## Usage
+```ruby
+require "loader_ruby"
+doc = LoaderRuby.load("document.pdf")
+doc.content   # => extracted text
+doc.metadata  # => { source: "document.pdf", format: :pdf, pages: 12, ... }
+doc = LoaderRuby.load("notes.md")
+doc = LoaderRuby.load("data.csv")
+docs = LoaderRuby::Loaders::Csv.new.load("data.csv", row_as_document: true)
+doc = LoaderRuby.load("https://example.com/page")
+docs = LoaderRuby.load_batch(["file1.pdf", "file2.docx"])
+```
+## License
+MIT

data/Rakefile ADDED Viewed

@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+require "rake/testtask"
+Rake::TestTask.new(:test) do |t|
+  t.libs << "test"
+  t.libs << "lib"
+  t.test_files = FileList["test/**/test_*.rb"]
+end
+task default: :test

data/lib/loader_ruby/configuration.rb ADDED Viewed

@@ -0,0 +1,15 @@
+# frozen_string_literal: true
+module LoaderRuby
+  class Configuration
+    attr_accessor :default_encoding, :max_file_size, :http_timeout,
+                  :web_user_agent
+    def initialize
+      @default_encoding = "UTF-8"
+      @max_file_size = 100 * 1024 * 1024
+      @http_timeout = 30
+      @web_user_agent = "LoaderRuby/#{VERSION}"
+    end
+  end
+end

data/lib/loader_ruby/document.rb ADDED Viewed

@@ -0,0 +1,43 @@
+# frozen_string_literal: true
+module LoaderRuby
+  class Document
+    attr_reader :content, :metadata
+    def initialize(content:, metadata: {})
+      @content = content
+      @metadata = metadata
+    end
+    def source
+      @metadata[:source]
+    end
+    def format
+      @metadata[:format]
+    end
+    def pages
+      @metadata[:pages]
+    end
+    def size
+      @content.length
+    end
+    def empty?
+      @content.nil? || @content.strip.empty?
+    end
+    def to_h
+      {
+        content: @content,
+        metadata: @metadata
+      }
+    end
+    def to_s
+      "Document(source: #{source}, format: #{self.format}, size: #{size})"
+    end
+  end
+end

data/lib/loader_ruby/encoding_detector.rb ADDED Viewed

@@ -0,0 +1,61 @@
+# frozen_string_literal: true
+module LoaderRuby
+  # Detects file encoding from BOM or content-type header and transcodes to UTF-8.
+  module EncodingDetector
+    BOM_MAP = {
+      "\xEF\xBB\xBF".b => "UTF-8",
+      "\xFF\xFE".b => "UTF-16LE",
+      "\xFE\xFF".b => "UTF-16BE",
+      "\xFF\xFE\x00\x00".b => "UTF-32LE",
+      "\x00\x00\xFE\xFF".b => "UTF-32BE"
+    }.freeze
+    private
+    # Detect encoding from BOM bytes at the start of raw content.
+    def detect_encoding_from_bom(raw_bytes)
+      # Check 4-byte BOMs first, then 3-byte, then 2-byte
+      if raw_bytes.bytesize >= 4
+        bom4 = raw_bytes.byteslice(0, 4)
+        return BOM_MAP[bom4] if BOM_MAP.key?(bom4)
+      end
+      if raw_bytes.bytesize >= 3
+        bom3 = raw_bytes.byteslice(0, 3)
+        return BOM_MAP[bom3] if BOM_MAP.key?(bom3)
+      end
+      if raw_bytes.bytesize >= 2
+        bom2 = raw_bytes.byteslice(0, 2)
+        return BOM_MAP[bom2] if BOM_MAP.key?(bom2)
+      end
+      nil
+    end
+    # Detect encoding from a Content-Type header value, e.g. "text/html; charset=iso-8859-1"
+    def detect_encoding_from_content_type(content_type)
+      return nil unless content_type
+      if content_type =~ /charset=([^\s;]+)/i
+        $1.strip
+      end
+    end
+    # Transcode content to UTF-8 from the detected or specified encoding.
+    # Returns a UTF-8 encoded string with invalid/undefined bytes replaced.
+    def transcode_to_utf8(content, source_encoding)
+      return content if source_encoding.nil?
+      normalized = source_encoding.upcase.strip
+      return content if normalized == "UTF-8" && content.encoding == ::Encoding::UTF_8 && content.valid_encoding?
+      begin
+        content.encode("UTF-8", source_encoding, invalid: :replace, undef: :replace)
+      rescue ::EncodingError => e
+        raise LoaderRuby::EncodingError, "Failed to transcode from #{source_encoding} to UTF-8: #{e.message}"
+      end
+    end
+  end
+end

data/lib/loader_ruby/error.rb ADDED Viewed

@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+module LoaderRuby
+  class Error < StandardError; end
+  class FileNotFoundError < Error; end
+  class UnsupportedFormatError < Error; end
+  class FileTooLargeError < Error; end
+  class DependencyMissingError < Error; end
+  class TooManyRedirectsError < Error; end
+  class EncodingError < Error; end
+end

data/lib/loader_ruby/html_extractor.rb ADDED Viewed

@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+module LoaderRuby
+  # Shared HTML-to-text extraction logic used by both Html and Web loaders.
+  module HtmlExtractor
+    REMOVE_SELECTORS = "script, style, nav, footer, header"
+    private
+    def require_nokogiri!
+      require "nokogiri"
+    rescue LoadError
+      raise DependencyMissingError,
+        "nokogiri gem is required for HTML loading. Add `gem 'nokogiri'` to your Gemfile."
+    end
+    def parse_html(html)
+      doc = Nokogiri::HTML(html)
+      doc.css(REMOVE_SELECTORS).remove
+      doc
+    end
+    def extract_title(doc)
+      doc.at_css("title")&.text&.strip
+    end
+    def extract_text(doc)
+      body = doc.at_css("body") || doc
+      body.text.gsub(/\s+/, " ").strip
+    end
+  end
+end

data/lib/loader_ruby/loaders/base.rb ADDED Viewed

@@ -0,0 +1,51 @@
+# frozen_string_literal: true
+require "time"
+require "uri"
+module LoaderRuby
+  module Loaders
+    class Base
+      def load(source, **opts)
+        raise NotImplementedError, "#{self.class}#load not implemented"
+      end
+      private
+      def validate_path!(path)
+        raise ArgumentError, "path cannot be nil" if path.nil?
+        raise ArgumentError, "path cannot be empty" if path.is_a?(String) && path.strip.empty?
+      end
+      def validate_url!(url)
+        raise ArgumentError, "URL cannot be nil" if url.nil?
+        raise ArgumentError, "URL cannot be empty" if url.is_a?(String) && url.strip.empty?
+        uri = URI.parse(url)
+        unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
+          raise ArgumentError, "Invalid URL: #{url}"
+        end
+      end
+      def check_file_exists!(path)
+        validate_path!(path)
+        raise FileNotFoundError, "File not found: #{path}" unless File.exist?(path)
+      end
+      def check_file_size!(path)
+        max = LoaderRuby.configuration.max_file_size
+        size = File.size(path)
+        if size > max
+          raise FileTooLargeError, "File too large: #{size} bytes (max: #{max})"
+        end
+      end
+      def build_metadata(source, format:, **extra)
+        {
+          source: source,
+          format: format,
+          loaded_at: Time.now.iso8601
+        }.merge(extra)
+      end
+    end
+  end
+end

data/lib/loader_ruby/loaders/csv.rb ADDED Viewed

@@ -0,0 +1,55 @@
+# frozen_string_literal: true
+require "csv"
+module LoaderRuby
+  module Loaders
+    class Csv < Base
+      EXTENSIONS = %w[.csv .tsv].freeze
+      def load(path, row_as_document: false, **opts)
+        check_file_exists!(path)
+        check_file_size!(path)
+        separator = path.end_with?(".tsv") ? "\t" : ","
+        table = ::CSV.read(path, headers: true, col_sep: separator)
+        if row_as_document
+          load_rows_as_documents(path, table)
+        else
+          load_as_single_document(path, table)
+        end
+      end
+      private
+      def load_as_single_document(path, table)
+        content = table.map { |row| row.to_h.map { |k, v| "#{k}: #{v}" }.join(", ") }.join("\n")
+        Document.new(
+          content: content,
+          metadata: build_metadata(path,
+            format: :csv,
+            rows: table.size,
+            headers: table.headers
+          )
+        )
+      end
+      def load_rows_as_documents(path, table)
+        table.map.with_index do |row, i|
+          content = row.to_h.map { |k, v| "#{k}: #{v}" }.join("\n")
+          Document.new(
+            content: content,
+            metadata: build_metadata(path,
+              format: :csv,
+              row_index: i,
+              headers: table.headers
+            )
+          )
+        end
+      end
+    end
+  end
+end

data/lib/loader_ruby/loaders/docx.rb ADDED Viewed

@@ -0,0 +1,33 @@
+# frozen_string_literal: true
+module LoaderRuby
+  module Loaders
+    class Docx < Base
+      EXTENSIONS = %w[.docx].freeze
+      def load(path, **opts)
+        check_file_exists!(path)
+        check_file_size!(path)
+        begin
+          require "docx"
+        rescue LoadError
+          raise DependencyMissingError,
+            "docx gem is required for DOCX loading. Add `gem 'docx'` to your Gemfile."
+        end
+        doc = ::Docx::Document.open(path)
+        paragraphs = doc.paragraphs.map(&:text)
+        content = paragraphs.join("\n")
+        Document.new(
+          content: content,
+          metadata: build_metadata(path,
+            format: :docx,
+            paragraphs: paragraphs.size
+          )
+        )
+      end
+    end
+  end
+end

data/lib/loader_ruby/loaders/html.rb ADDED Viewed

@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+module LoaderRuby
+  module Loaders
+    class Html < Base
+      include HtmlExtractor
+      include EncodingDetector
+      EXTENSIONS = %w[.html .htm].freeze
+      def load(path, **opts)
+        check_file_exists!(path)
+        check_file_size!(path)
+        require_nokogiri!
+        raw = File.binread(path)
+        detected = detect_encoding_from_bom(raw)
+        html = transcode_to_utf8(raw, detected || "UTF-8")
+        doc = parse_html(html)
+        title = extract_title(doc)
+        content = extract_text(doc)
+        Document.new(
+          content: content,
+          metadata: build_metadata(path,
+            format: :html,
+            title: title
+          )
+        )
+      end
+    end
+  end
+end

data/lib/loader_ruby/loaders/pdf.rb ADDED Viewed

@@ -0,0 +1,36 @@
+# frozen_string_literal: true
+module LoaderRuby
+  module Loaders
+    class Pdf < Base
+      EXTENSIONS = %w[.pdf].freeze
+      def load(path, password: nil, **opts)
+        check_file_exists!(path)
+        check_file_size!(path)
+        begin
+          require "pdf-reader"
+        rescue LoadError
+          raise DependencyMissingError,
+            "pdf-reader gem is required for PDF loading. Add `gem 'pdf-reader'` to your Gemfile."
+        end
+        reader_opts = {}
+        reader_opts[:password] = password if password
+        reader = PDF::Reader.new(path, **reader_opts)
+        pages = reader.pages.map(&:text)
+        content = pages.join("\n\n")
+        Document.new(
+          content: content,
+          metadata: build_metadata(path,
+            format: :pdf,
+            pages: reader.page_count,
+            info: reader.info
+          )
+        )
+      end
+    end
+  end
+end

data/lib/loader_ruby/loaders/text.rb ADDED Viewed

@@ -0,0 +1,27 @@
+# frozen_string_literal: true
+module LoaderRuby
+  module Loaders
+    class Text < Base
+      include EncodingDetector
+      EXTENSIONS = %w[.txt .md .markdown .text .log .rst].freeze
+      def load(path, **opts)
+        check_file_exists!(path)
+        check_file_size!(path)
+        explicit_encoding = opts[:encoding]
+        raw = File.binread(path)
+        detected = explicit_encoding || detect_encoding_from_bom(raw) || LoaderRuby.configuration.default_encoding
+        content = transcode_to_utf8(raw, detected)
+        Document.new(
+          content: content,
+          metadata: build_metadata(path, format: :text, encoding: detected)
+        )
+      end
+    end
+  end
+end

data/lib/loader_ruby/loaders/web.rb ADDED Viewed

@@ -0,0 +1,94 @@
+# frozen_string_literal: true
+require "net/http"
+require "uri"
+require "set"
+module LoaderRuby
+  module Loaders
+    class Web < Base
+      include HtmlExtractor
+      include EncodingDetector
+      DEFAULT_MAX_REDIRECTS = 5
+      def load(url, max_redirects: DEFAULT_MAX_REDIRECTS, **opts)
+        validate_url!(url)
+        require_nokogiri!
+        html, content_type = fetch(url, max_redirects: max_redirects)
+        detected = detect_encoding_from_content_type(content_type) ||
+                   detect_encoding_from_bom(html.b)
+        html = transcode_to_utf8(html, detected) if detected
+        doc = parse_html(html)
+        title = extract_title(doc)
+        content = extract_text(doc)
+        Document.new(
+          content: content,
+          metadata: build_metadata(url,
+            format: :web,
+            title: title
+          )
+        )
+      end
+      def crawl(start_url, max_pages: 10, max_redirects: DEFAULT_MAX_REDIRECTS)
+        visited = Set.new
+        queue = [start_url]
+        documents = []
+        while queue.any? && documents.size < max_pages
+          url = queue.shift
+          next if visited.include?(url)
+          visited << url
+          begin
+            doc = load(url, max_redirects: max_redirects)
+            documents << doc
+          rescue StandardError
+            next
+          end
+        end
+        documents
+      end
+      private
+      def fetch(url, max_redirects:, redirects_followed: 0)
+        if redirects_followed > max_redirects
+          raise TooManyRedirectsError,
+            "Too many redirects (followed #{redirects_followed}, max: #{max_redirects})"
+        end
+        uri = URI.parse(url)
+        config = LoaderRuby.configuration
+        http = Net::HTTP.new(uri.host, uri.port)
+        http.use_ssl = uri.scheme == "https"
+        http.read_timeout = config.http_timeout
+        req = Net::HTTP::Get.new(uri.request_uri)
+        req["User-Agent"] = config.web_user_agent
+        response = http.request(req)
+        case response.code.to_i
+        when 200..299
+          [response.body, response["Content-Type"]]
+        when 301, 302, 303, 307, 308
+          location = response["Location"]
+          # Handle relative redirects
+          location = URI.join(url, location).to_s unless location.start_with?("http")
+          fetch(location, max_redirects: max_redirects, redirects_followed: redirects_followed + 1)
+        else
+          raise Error, "HTTP #{response.code} fetching #{url}"
+        end
+      end
+    end
+  end
+end

data/lib/loader_ruby/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+module LoaderRuby
+  # Gem version string; the gemspec reads it for spec.version and
+  # Configuration interpolates it into the default web_user_agent.
+  VERSION = "0.1.1"
+end

data/lib/loader_ruby.rb ADDED Viewed

@@ -0,0 +1,59 @@
+# frozen_string_literal: true
+require_relative "loader_ruby/version"
+require_relative "loader_ruby/error"
+require_relative "loader_ruby/configuration"
+require_relative "loader_ruby/document"
+require_relative "loader_ruby/html_extractor"
+require_relative "loader_ruby/encoding_detector"
+require_relative "loader_ruby/loaders/base"
+require_relative "loader_ruby/loaders/text"
+require_relative "loader_ruby/loaders/pdf"
+require_relative "loader_ruby/loaders/docx"
+require_relative "loader_ruby/loaders/csv"
+require_relative "loader_ruby/loaders/html"
+require_relative "loader_ruby/loaders/web"
+module LoaderRuby
+  FORMAT_MAP = {
+    ".txt" => Loaders::Text, ".md" => Loaders::Text, ".markdown" => Loaders::Text,
+    ".text" => Loaders::Text, ".log" => Loaders::Text, ".rst" => Loaders::Text,
+    ".pdf" => Loaders::Pdf,
+    ".docx" => Loaders::Docx,
+    ".csv" => Loaders::Csv, ".tsv" => Loaders::Csv,
+    ".html" => Loaders::Html, ".htm" => Loaders::Html
+  }.freeze
+  class << self
+    def configuration
+      @configuration ||= Configuration.new
+    end
+    def configure
+      yield(configuration)
+    end
+    def reset_configuration!
+      @configuration = Configuration.new
+    end
+    def load(source, **opts)
+      raise ArgumentError, "source cannot be nil" if source.nil?
+      raise ArgumentError, "source cannot be empty" if source.is_a?(String) && source.strip.empty?
+      if source.start_with?("http://", "https://")
+        Loaders::Web.new.load(source, **opts)
+      else
+        ext = File.extname(source).downcase
+        loader_class = FORMAT_MAP[ext]
+        raise UnsupportedFormatError, "Unsupported format: #{ext}" unless loader_class
+        loader_class.new.load(source, **opts)
+      end
+    end
+    def load_batch(sources, **opts)
+      sources.map { |source| load(source, **opts) }
+    end
+  end
+end

data/loader-ruby.gemspec ADDED Viewed

@@ -0,0 +1,35 @@
+# frozen_string_literal: true
+require_relative "lib/loader_ruby/version"
+Gem::Specification.new do |spec|
+  spec.name = "loader-ruby"
+  spec.version = LoaderRuby::VERSION
+  spec.authors = ["Johannes Dwi Cahyo"]
+  spec.email = ["johannes@example.com"]
+  spec.summary = "Document loader library for Ruby RAG pipelines"
+  spec.description = "Document extraction for RAG pipelines. Loads PDF, DOCX, CSV, HTML, and web pages into a normalized Document format for chunking and embedding."
+  spec.homepage = "https://github.com/johannesdwicahyo/loader-ruby"
+  spec.license = "MIT"
+  spec.required_ruby_version = ">= 3.0.0"
+  spec.metadata["homepage_uri"] = spec.homepage
+  spec.metadata["source_code_uri"] = spec.homepage
+  spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/main/CHANGELOG.md"
+  spec.files = Dir[
+    "lib/**/*.rb",
+    "README.md",
+    "LICENSE",
+    "CHANGELOG.md",
+    "Rakefile",
+    "loader-ruby.gemspec"
+  ]
+  spec.require_paths = ["lib"]
+  spec.add_dependency "csv"
+  spec.add_development_dependency "minitest", "~> 5.0"
+  spec.add_development_dependency "rake", "~> 13.0"
+  spec.add_development_dependency "webmock", "~> 3.0"
+end

metadata ADDED Viewed

@@ -0,0 +1,119 @@
+--- !ruby/object:Gem::Specification
+name: loader-ruby
+version: !ruby/object:Gem::Version
+  version: 0.1.1
+platform: ruby
+authors:
+- Johannes Dwi Cahyo
+bindir: bin
+cert_chain: []
+date: 1980-01-02 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: csv
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: minitest
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.0'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '13.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '13.0'
+- !ruby/object:Gem::Dependency
+  name: webmock
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+description: Document extraction for RAG pipelines. Loads PDF, DOCX, CSV, HTML, and
+  web pages into a normalized Document format for chunking and embedding.
+email:
+- johannes@example.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- CHANGELOG.md
+- LICENSE
+- README.md
+- Rakefile
+- lib/loader_ruby.rb
+- lib/loader_ruby/configuration.rb
+- lib/loader_ruby/document.rb
+- lib/loader_ruby/encoding_detector.rb
+- lib/loader_ruby/error.rb
+- lib/loader_ruby/html_extractor.rb
+- lib/loader_ruby/loaders/base.rb
+- lib/loader_ruby/loaders/csv.rb
+- lib/loader_ruby/loaders/docx.rb
+- lib/loader_ruby/loaders/html.rb
+- lib/loader_ruby/loaders/pdf.rb
+- lib/loader_ruby/loaders/text.rb
+- lib/loader_ruby/loaders/web.rb
+- lib/loader_ruby/version.rb
+- loader-ruby.gemspec
+homepage: https://github.com/johannesdwicahyo/loader-ruby
+licenses:
+- MIT
+metadata:
+  homepage_uri: https://github.com/johannesdwicahyo/loader-ruby
+  source_code_uri: https://github.com/johannesdwicahyo/loader-ruby
+  changelog_uri: https://github.com/johannesdwicahyo/loader-ruby/blob/main/CHANGELOG.md
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: 3.0.0
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.6.9
+specification_version: 4
+summary: Document loader library for Ruby RAG pipelines
+test_files: []