RubyGems - loader-ruby - Versions diffs - 0.1.1 → 0.2.0 - Mend

loader-ruby 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

checksums.yaml +4 -4
data/README.md +23 -16
data/lib/loader_ruby/format_detector.rb +44 -0
data/lib/loader_ruby/loaders/email.rb +48 -0
data/lib/loader_ruby/loaders/epub.rb +49 -0
data/lib/loader_ruby/loaders/json_loader.rb +40 -0
data/lib/loader_ruby/loaders/rtf.rb +30 -0
data/lib/loader_ruby/loaders/xlsx.rb +62 -0
data/lib/loader_ruby/loaders/xml.rb +28 -0
data/lib/loader_ruby/parallel_loader.rb +41 -0
data/lib/loader_ruby/streaming_loader.rb +31 -0
data/lib/loader_ruby/version.rb +1 -1
data/lib/loader_ruby.rb +16 -1
metadata +10 -1

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 663d36e070a5a41ba1c34ff5cc7d734b34f92eebbb7a5e104d7e2ceb7f4854f0
-  data.tar.gz: fbea2f56dce6dfca84ff898fa3626c0ec56092d3466fdc82f0327f9cd937a5ba
+  metadata.gz: f2eda1fbd4867015dffcc0364079276820546d4fcaf997a0cd80234f8f5a87b9
+  data.tar.gz: ca732a300d2474380b5bfe4c691198f3c7df44818fac399541bf94c309fca447
 SHA512:
-  metadata.gz: 9cb3f7f935f3779a68524f11af2c734d2219b68c1a7d3b8c0cd841a71052452146d55819d6e91938da7d8904b7d028899beb5b835cd9354be621f6e2ad9de344
-  data.tar.gz: 59df44fe24d1e8a7d15bf48e9da9f72fca101a0ea8ae81b46c9cf9bfc45b86c65dfa5388def61a4ec86bc7a0d7e0b354bd31a214490cddbd75fb78e8b8d928fd
+  metadata.gz: fdcd3ac61ecca175d77f3060d58dfbcb9f5e104e7dc52e3fc295afd6066f46073ba15582837aa0aa5f1395411ccdb31a1d2d46a4f68283f217d437ba80d8d51f
+  data.tar.gz: ab31d1b87da052ae16dff6d93aa215901e208b3480cf4aef46a143cf065714ae2139d5d901ebb8eeaead0427a49412efe7a30aa34248f0694e042c070543f330

data/README.md CHANGED

@@ -1,16 +1,11 @@
 # loader-ruby
-Document loader library for Ruby RAG pipelines. Extracts text from PDF, DOCX, CSV, HTML, and web pages.
+Document loader library for Ruby RAG pipelines. Load text from PDF, HTML, CSV, DOCX, and web URLs.
 ## Installation
 ```ruby
-gem "loader-ruby", "~> 0.1"
-# Optional dependencies for specific formats:
-gem "pdf-reader"  # PDF support
-gem "nokogiri"    # HTML/web support
-gem "docx"        # DOCX support
+gem "loader-ruby"
 ```
 ## Usage
@@ -18,21 +13,33 @@ gem "docx"        # DOCX support
 ```ruby
 require "loader_ruby"
-doc = LoaderRuby.load("document.pdf")
-doc.content   # => extracted text
-doc.metadata  # => { source: "document.pdf", format: :pdf, pages: 12, ... }
-doc = LoaderRuby.load("notes.md")
+# Auto-detect format from file extension
+doc = LoaderRuby.load("report.pdf")
 doc = LoaderRuby.load("data.csv")
+doc = LoaderRuby.load("page.html")
-docs = LoaderRuby::Loaders::Csv.new.load("data.csv", row_as_document: true)
+# Web loader with redirect handling
+doc = LoaderRuby.load("https://example.com/article")
-doc = LoaderRuby.load("https://example.com/page")
+# PDF with password
+loader = LoaderRuby::Loaders::Pdf.new("encrypted.pdf", password: "secret")
+doc = loader.load
-docs = LoaderRuby.load_batch(["file1.pdf", "file2.docx"])
+# Access content
+doc.content   # => extracted text
+doc.metadata  # => { source: "report.pdf", ... }
 ```
+## Features
+- PDF, HTML, CSV, DOCX, and plain text loaders
+- Web loader with configurable max redirects (default: 5)
+- Encoding auto-detection (BOM, Content-Type charset)
+- Graceful transcoding to UTF-8
+- Shared HTML extraction module
+- Error hierarchy (FileNotFoundError, TooManyRedirectsError, etc.)
+- Input validation for paths and URLs
 ## License
 MIT

data/lib/loader_ruby/format_detector.rb ADDED

@@ -0,0 +1,44 @@
+# frozen_string_literal: true
+module LoaderRuby
+  module FormatDetector
+    MAGIC_BYTES = {
+      pdf: [0x25, 0x50, 0x44, 0x46],       # %PDF
+      zip: [0x50, 0x4B, 0x03, 0x04],       # PK (XLSX, DOCX, EPUB)
+      rtf: [0x7B, 0x5C, 0x72, 0x74, 0x66], # {\rtf
+    }.freeze
+    def self.detect(path)
+      return nil unless File.exist?(path)
+      bytes = File.binread(path, 8).bytes
+      MAGIC_BYTES.each do |format, signature|
+        if bytes[0, signature.length] == signature
+          return resolve_zip(path) if format == :zip
+          return format
+        end
+      end
+      # Fallback: try content inspection
+      content = File.read(path, 1024, encoding: "UTF-8") rescue nil
+      return nil unless content
+      return :json if content.strip.start_with?("{", "[")
+      return :email if content.match?(/\AFrom:|Subject:|Content-Type:/i)
+      return :html if content.match?(/<html|<!DOCTYPE html/i)
+      return :xml if content.strip.start_with?("<?xml", "<")
+      nil
+    end
+    def self.resolve_zip(path)
+      # Peek inside ZIP to determine specific format
+      content = File.binread(path, 2048)
+      return :docx if content.include?("word/document.xml")
+      return :xlsx if content.include?("xl/workbook.xml")
+      return :epub if content.include?("META-INF/container.xml") || content.include?("mimetype")
+      :zip
+    end
+  end
+end

data/lib/loader_ruby/loaders/email.rb ADDED

@@ -0,0 +1,48 @@
+# frozen_string_literal: true
+module LoaderRuby
+  module Loaders
+    class Email < Base
+      def load(path, **opts)
+        check_file_exists!(path)
+        check_file_size!(path)
+        raw = File.read(path)
+        headers, body = parse_email(raw)
+        Document.new(
+          content: body,
+          metadata: build_metadata(path, format: :email,
+            subject: headers["subject"],
+            from: headers["from"],
+            to: headers["to"],
+            date: headers["date"])
+        )
+      end
+      private
+      def parse_email(raw)
+        # Split headers and body at first blank line
+        parts = raw.split(/\r?\n\r?\n/, 2)
+        header_text = parts[0] || ""
+        body = parts[1] || ""
+        headers = {}
+        header_text.split(/\r?\n/).each do |line|
+          if line.match?(/\A\S+:/)
+            key, value = line.split(":", 2)
+            headers[key.strip.downcase] = value&.strip
+          end
+        end
+        # Strip HTML from body if it looks like HTML
+        if body.include?("<html") || body.include?("<body")
+          body = body.gsub(/<[^>]+>/, " ").gsub(/\s+/, " ").strip
+        end
+        [headers, body.strip]
+      end
+    end
+  end
+end

data/lib/loader_ruby/loaders/epub.rb ADDED

@@ -0,0 +1,49 @@
+# frozen_string_literal: true
+require "zip" if defined?(Zip) || begin; require "zip"; rescue LoadError; false; end
+module LoaderRuby
+  module Loaders
+    class Epub < Base
+      def load(path, **opts)
+        check_file_exists!(path)
+        check_file_size!(path)
+        begin
+          require "zip"
+        rescue LoadError
+          raise DependencyMissingError, "rubyzip gem is required for EPUB loading"
+        end
+        content = extract_text(path)
+        Document.new(
+          content: content,
+          metadata: build_metadata(path, format: :epub)
+        )
+      end
+      private
+      def extract_text(path)
+        texts = []
+        Zip::File.open(path) do |zip|
+          zip.each do |entry|
+            next unless entry.name.end_with?(".xhtml", ".html", ".htm")
+            html = entry.get_input_stream.read
+            texts << strip_html(html)
+          end
+        end
+        texts.join("\n\n")
+      end
+      def strip_html(html)
+        html.gsub(/<script[^>]*>.*?<\/script>/m, "")
+            .gsub(/<style[^>]*>.*?<\/style>/m, "")
+            .gsub(/<[^>]+>/, " ")
+            .gsub(/\s+/, " ")
+            .strip
+      end
+    end
+  end
+end

data/lib/loader_ruby/loaders/json_loader.rb ADDED

@@ -0,0 +1,40 @@
+# frozen_string_literal: true
+require "json"
+module LoaderRuby
+  module Loaders
+    class Json < Base
+      def load(path, text_key: nil, **opts)
+        check_file_exists!(path)
+        check_file_size!(path)
+        raw = File.read(path)
+        data = ::JSON.parse(raw)
+        content = if text_key
+                    extract_by_key(data, text_key)
+                  else
+                    ::JSON.pretty_generate(data)
+                  end
+        Document.new(
+          content: content,
+          metadata: build_metadata(path, format: :json, keys: data.is_a?(Hash) ? data.keys : nil)
+        )
+      end
+      private
+      def extract_by_key(data, key)
+        if data.is_a?(Array)
+          data.map { |item| item.is_a?(Hash) ? item[key].to_s : item.to_s }.join("\n")
+        elsif data.is_a?(Hash)
+          data[key].to_s
+        else
+          data.to_s
+        end
+      end
+    end
+  end
+end

data/lib/loader_ruby/loaders/rtf.rb ADDED

@@ -0,0 +1,30 @@
+# frozen_string_literal: true
+module LoaderRuby
+  module Loaders
+    class Rtf < Base
+      def load(path, **opts)
+        check_file_exists!(path)
+        check_file_size!(path)
+        raw = File.read(path)
+        content = strip_rtf(raw)
+        Document.new(
+          content: content,
+          metadata: build_metadata(path, format: :rtf)
+        )
+      end
+      private
+      def strip_rtf(text)
+        # Remove RTF control words, keep plain text
+        text = text.gsub(/\\[a-z]+\d*[ ]?/i, "")  # Remove control words
+        text = text.gsub(/[{}]/, "")              # Remove braces
+        text = text.gsub(/\s+/, " ").strip
+        text
+      end
+    end
+  end
+end

data/lib/loader_ruby/loaders/xlsx.rb ADDED

@@ -0,0 +1,62 @@
+# frozen_string_literal: true
+module LoaderRuby
+  module Loaders
+    class Xlsx < Base
+      def load(path, sheet: nil, row_as_document: false, **opts)
+        check_file_exists!(path)
+        check_file_size!(path)
+        begin
+          require "roo"
+        rescue LoadError
+          raise DependencyMissingError, "roo gem is required for XLSX loading"
+        end
+        workbook = Roo::Spreadsheet.open(path)
+        worksheet = sheet ? workbook.sheet(sheet) : workbook.sheet(0)
+        if row_as_document
+          load_rows(path, worksheet)
+        else
+          load_all(path, worksheet)
+        end
+      end
+      private
+      def load_all(path, worksheet)
+        rows = []
+        worksheet.each_row_streaming do |row|
+          rows << row.map { |cell| cell&.value.to_s }.join("\t")
+        end
+        Document.new(
+          content: rows.join("\n"),
+          metadata: build_metadata(path, format: :xlsx, rows: rows.size)
+        )
+      end
+      def load_rows(path, worksheet)
+        headers = nil
+        documents = []
+        worksheet.each_row_streaming.each_with_index do |row, i|
+          values = row.map { |cell| cell&.value.to_s }
+          if i == 0
+            headers = values
+            next
+          end
+          content = headers ? headers.zip(values).map { |k, v| "#{k}: #{v}" }.join("\n") : values.join("\t")
+          documents << Document.new(
+            content: content,
+            metadata: build_metadata(path, format: :xlsx, row_index: i, headers: headers)
+          )
+        end
+        documents
+      end
+    end
+  end
+end

data/lib/loader_ruby/loaders/xml.rb ADDED

@@ -0,0 +1,28 @@
+# frozen_string_literal: true
+module LoaderRuby
+  module Loaders
+    class Xml < Base
+      def load(path, **opts)
+        check_file_exists!(path)
+        check_file_size!(path)
+        begin
+          require "nokogiri"
+        rescue LoadError
+          raise DependencyMissingError, "nokogiri gem is required for XML loading"
+        end
+        raw = File.read(path)
+        doc = Nokogiri::XML(raw)
+        content = doc.text.gsub(/\s+/, " ").strip
+        Document.new(
+          content: content,
+          metadata: build_metadata(path, format: :xml, root: doc.root&.name)
+        )
+      end
+    end
+  end
+end

data/lib/loader_ruby/parallel_loader.rb ADDED

@@ -0,0 +1,41 @@
+# frozen_string_literal: true
+module LoaderRuby
+  class ParallelLoader
+    def initialize(threads: 4)
+      @threads = threads
+    end
+    def load(sources, **opts)
+      return sources.map { |s| LoaderRuby.load(s, **opts) } if @threads <= 1
+      results = Array.new(sources.size)
+      errors = []
+      mutex = Mutex.new
+      work_queue = Queue.new
+      sources.each_with_index { |s, i| work_queue << [s, i] }
+      @threads.times { work_queue << nil }  # Poison pills
+      threads = @threads.times.map do
+        Thread.new do
+          while (item = work_queue.pop)
+            source, index = item
+            begin
+              doc = LoaderRuby.load(source, **opts)
+              mutex.synchronize { results[index] = doc }
+            rescue => e
+              mutex.synchronize { errors << { source: source, error: e } }
+            end
+          end
+        end
+      end
+      threads.each(&:join)
+      raise Error, "#{errors.size} files failed to load" if errors.any? && results.compact.empty?
+      results
+    end
+  end
+end

data/lib/loader_ruby/streaming_loader.rb ADDED

@@ -0,0 +1,31 @@
+# frozen_string_literal: true
+module LoaderRuby
+  class StreamingLoader
+    DEFAULT_CHUNK_SIZE = 64 * 1024  # 64KB
+    def initialize(chunk_size: DEFAULT_CHUNK_SIZE)
+      @chunk_size = chunk_size
+    end
+    def load(path, &block)
+      raise ArgumentError, "Block required for streaming" unless block_given?
+      raise FileNotFoundError, "File not found: #{path}" unless File.exist?(path)
+      File.open(path, "rb") do |file|
+        while (chunk = file.read(@chunk_size))
+          yield chunk
+        end
+      end
+    end
+    def load_lines(path, &block)
+      raise ArgumentError, "Block required for streaming" unless block_given?
+      raise FileNotFoundError, "File not found: #{path}" unless File.exist?(path)
+      File.foreach(path) do |line|
+        yield line
+      end
+    end
+  end
+end

data/lib/loader_ruby/version.rb CHANGED

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module LoaderRuby
-  VERSION = "0.1.1"
+  VERSION = "0.2.0"
 end

data/lib/loader_ruby.rb CHANGED

@@ -13,6 +13,15 @@ require_relative "loader_ruby/loaders/docx"
 require_relative "loader_ruby/loaders/csv"
 require_relative "loader_ruby/loaders/html"
 require_relative "loader_ruby/loaders/web"
+require_relative "loader_ruby/loaders/json_loader"
+require_relative "loader_ruby/loaders/xml"
+require_relative "loader_ruby/loaders/epub"
+require_relative "loader_ruby/loaders/rtf"
+require_relative "loader_ruby/loaders/email"
+require_relative "loader_ruby/loaders/xlsx"
+require_relative "loader_ruby/format_detector"
+require_relative "loader_ruby/parallel_loader"
+require_relative "loader_ruby/streaming_loader"
 module LoaderRuby
   FORMAT_MAP = {
@@ -21,7 +30,13 @@ module LoaderRuby
     ".pdf" => Loaders::Pdf,
     ".docx" => Loaders::Docx,
     ".csv" => Loaders::Csv, ".tsv" => Loaders::Csv,
-    ".html" => Loaders::Html, ".htm" => Loaders::Html
+    ".html" => Loaders::Html, ".htm" => Loaders::Html,
+    ".json" => Loaders::Json,
+    ".xml" => Loaders::Xml,
+    ".epub" => Loaders::Epub,
+    ".rtf" => Loaders::Rtf,
+    ".eml" => Loaders::Email,
+    ".xlsx" => Loaders::Xlsx, ".xls" => Loaders::Xlsx
   }.freeze
   class << self

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: loader-ruby
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.2.0
 platform: ruby
 authors:
 - Johannes Dwi Cahyo
@@ -82,14 +82,23 @@ files:
 - lib/loader_ruby/document.rb
 - lib/loader_ruby/encoding_detector.rb
 - lib/loader_ruby/error.rb
+- lib/loader_ruby/format_detector.rb
 - lib/loader_ruby/html_extractor.rb
 - lib/loader_ruby/loaders/base.rb
 - lib/loader_ruby/loaders/csv.rb
 - lib/loader_ruby/loaders/docx.rb
+- lib/loader_ruby/loaders/email.rb
+- lib/loader_ruby/loaders/epub.rb
 - lib/loader_ruby/loaders/html.rb
+- lib/loader_ruby/loaders/json_loader.rb
 - lib/loader_ruby/loaders/pdf.rb
+- lib/loader_ruby/loaders/rtf.rb
 - lib/loader_ruby/loaders/text.rb
 - lib/loader_ruby/loaders/web.rb
+- lib/loader_ruby/loaders/xlsx.rb
+- lib/loader_ruby/loaders/xml.rb
+- lib/loader_ruby/parallel_loader.rb
+- lib/loader_ruby/streaming_loader.rb
 - lib/loader_ruby/version.rb
 - loader-ruby.gemspec
 homepage: https://github.com/johannesdwicahyo/loader-ruby