loader-ruby 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 663d36e070a5a41ba1c34ff5cc7d734b34f92eebbb7a5e104d7e2ceb7f4854f0
4
- data.tar.gz: fbea2f56dce6dfca84ff898fa3626c0ec56092d3466fdc82f0327f9cd937a5ba
3
+ metadata.gz: f2eda1fbd4867015dffcc0364079276820546d4fcaf997a0cd80234f8f5a87b9
4
+ data.tar.gz: ca732a300d2474380b5bfe4c691198f3c7df44818fac399541bf94c309fca447
5
5
  SHA512:
6
- metadata.gz: 9cb3f7f935f3779a68524f11af2c734d2219b68c1a7d3b8c0cd841a71052452146d55819d6e91938da7d8904b7d028899beb5b835cd9354be621f6e2ad9de344
7
- data.tar.gz: 59df44fe24d1e8a7d15bf48e9da9f72fca101a0ea8ae81b46c9cf9bfc45b86c65dfa5388def61a4ec86bc7a0d7e0b354bd31a214490cddbd75fb78e8b8d928fd
6
+ metadata.gz: fdcd3ac61ecca175d77f3060d58dfbcb9f5e104e7dc52e3fc295afd6066f46073ba15582837aa0aa5f1395411ccdb31a1d2d46a4f68283f217d437ba80d8d51f
7
+ data.tar.gz: ab31d1b87da052ae16dff6d93aa215901e208b3480cf4aef46a143cf065714ae2139d5d901ebb8eeaead0427a49412efe7a30aa34248f0694e042c070543f330
data/README.md CHANGED
@@ -1,16 +1,11 @@
1
1
  # loader-ruby
2
2
 
3
- Document loader library for Ruby RAG pipelines. Extracts text from PDF, DOCX, CSV, HTML, and web pages.
3
+ Document loader library for Ruby RAG pipelines. Load text from PDF, HTML, CSV, DOCX, and web URLs.
4
4
 
5
5
  ## Installation
6
6
 
7
7
  ```ruby
8
- gem "loader-ruby", "~> 0.1"
9
-
10
- # Optional dependencies for specific formats:
11
- gem "pdf-reader" # PDF support
12
- gem "nokogiri" # HTML/web support
13
- gem "docx" # DOCX support
8
+ gem "loader-ruby"
14
9
  ```
15
10
 
16
11
  ## Usage
@@ -18,21 +13,33 @@ gem "docx" # DOCX support
18
13
  ```ruby
19
14
  require "loader_ruby"
20
15
 
21
- doc = LoaderRuby.load("document.pdf")
22
- doc.content # => extracted text
23
- doc.metadata # => { source: "document.pdf", format: :pdf, pages: 12, ... }
24
-
25
- doc = LoaderRuby.load("notes.md")
26
-
16
+ # Auto-detect format from file extension
17
+ doc = LoaderRuby.load("report.pdf")
27
18
  doc = LoaderRuby.load("data.csv")
19
+ doc = LoaderRuby.load("page.html")
28
20
 
29
- docs = LoaderRuby::Loaders::Csv.new.load("data.csv", row_as_document: true)
21
+ # Web loader with redirect handling
22
+ doc = LoaderRuby.load("https://example.com/article")
30
23
 
31
- doc = LoaderRuby.load("https://example.com/page")
24
+ # PDF with password
25
+ loader = LoaderRuby::Loaders::Pdf.new("encrypted.pdf", password: "secret")
26
+ doc = loader.load
32
27
 
33
- docs = LoaderRuby.load_batch(["file1.pdf", "file2.docx"])
28
+ # Access content
29
+ doc.content # => extracted text
30
+ doc.metadata # => { source: "report.pdf", ... }
34
31
  ```
35
32
 
33
+ ## Features
34
+
35
+ - PDF, HTML, CSV, DOCX, JSON, XML, EPUB, RTF, email (.eml), XLSX, and plain text loaders
36
+ - Web loader with configurable max redirects (default: 5)
37
+ - Encoding auto-detection (BOM, Content-Type charset)
38
+ - Graceful transcoding to UTF-8
39
+ - Shared HTML extraction module
40
+ - Error hierarchy (FileNotFoundError, TooManyRedirectsError, etc.)
41
+ - Input validation for paths and URLs
42
+
36
43
  ## License
37
44
 
38
45
  MIT
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
module LoaderRuby
  # Best-effort format sniffing for files on disk.
  #
  # Detection first compares the leading bytes of the file against known
  # binary signatures and then falls back to inspecting the first kilobyte
  # as text. The result is a hint, not a guarantee.
  module FormatDetector
    MAGIC_BYTES = {
      pdf: [0x25, 0x50, 0x44, 0x46],      # %PDF
      zip: [0x50, 0x4B, 0x03, 0x04],      # PK (XLSX, DOCX, EPUB)
      rtf: [0x7B, 0x5C, 0x72, 0x74, 0x66] # {\rtf
    }.freeze

    # Number of leading bytes inspected by the text fallback.
    SNIFF_BYTES = 1024
    # Leading / trailing windows scanned for ZIP member names.
    ZIP_PEEK_BYTES = 2048
    ZIP_TAIL_BYTES = 64 * 1024

    # Header names must start a line; anchoring each alternative avoids
    # classifying any text that merely mentions "Subject:" as an email.
    EMAIL_HEADER = /^(?:From|Subject|Content-Type):/i
    HTML_MARKER = /<html|<!DOCTYPE html/i

    # Guesses the format of the file at +path+.
    #
    # @param path [String] file system path
    # @return [Symbol, nil] one of :pdf, :rtf, :docx, :xlsx, :epub, :zip,
    #   :json, :email, :html, :xml, or nil when the path is not a regular
    #   file or nothing matches
    def self.detect(path)
      return nil unless File.file?(path)

      # binread returns nil for an empty file, hence the to_s.
      head = File.binread(path, SNIFF_BYTES).to_s
      lead = head.byteslice(0, 8).bytes

      MAGIC_BYTES.each do |format, signature|
        next unless lead.first(signature.length) == signature

        return format == :zip ? resolve_zip(path) : format
      end

      detect_text(head)
    end

    # Narrows a ZIP container down to a specific document format by looking
    # for well-known member names.
    #
    # @param path [String] path of a ZIP archive
    # @return [Symbol] :docx, :xlsx, :epub, or :zip when nothing more
    #   specific is recognised
    def self.resolve_zip(path)
      listing = zip_peek(path)
      return :docx if listing.include?("word/document.xml")
      return :xlsx if listing.include?("xl/workbook.xml")
      # The bare word "mimetype" also appears in OpenDocument archives, so
      # look for the EPUB media type itself.
      return :epub if listing.include?("META-INF/container.xml") || listing.include?("application/epub+zip")

      :zip
    end

    # Classifies the leading bytes of a file as one of the text formats.
    # Invalid or truncated UTF-8 sequences and a leading BOM are dropped
    # before matching.
    def self.detect_text(head)
      text = head.dup.force_encoding(Encoding::UTF_8).scrub("").delete_prefix("\uFEFF").strip

      return :json if text.start_with?("{", "[")
      return :email if text.match?(EMAIL_HEADER)
      return :html if text.match?(HTML_MARKER)
      return :xml if text.start_with?("<") # also covers "<?xml"

      nil
    end
    private_class_method :detect_text

    # Returns the head of the archive plus its tail. Member names are stored
    # uncompressed both in the local headers near the start and in the
    # central directory at the end of a ZIP file.
    def self.zip_peek(path)
      File.open(path, "rb") do |io|
        head = io.read(ZIP_PEEK_BYTES).to_s
        tail_start = [io.size - ZIP_TAIL_BYTES, head.bytesize].max
        next head if tail_start >= io.size

        io.seek(tail_start)
        head + io.read.to_s
      end
    end
    private_class_method :zip_peek
  end
end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
module LoaderRuby
  module Loaders
    # Loads a single plain-text email message (e.g. an .eml file) into a
    # Document. Only a flat header block followed by a body is understood;
    # MIME multipart structure and transfer encodings are not decoded.
    class Email < Base
      HEADER_LINE = /\A\S+:/
      HTML_HINT = /<html|<body/i

      # @param path [String] path of the message file
      # @param opts [Hash] accepted for interface parity with the other
      #   loaders; not used here
      # @return [Document] body text as content; subject/from/to/date are
      #   passed to build_metadata
      def load(path, **opts)
        check_file_exists!(path)
        check_file_size!(path)

        headers, body = parse_email(File.read(path))

        Document.new(
          content: body,
          metadata: build_metadata(path, format: :email,
                                         subject: headers["subject"],
                                         from: headers["from"],
                                         to: headers["to"],
                                         date: headers["date"])
        )
      end

      private

      # Splits a raw message into a header hash (lower-cased names) and the
      # cleaned body text.
      def parse_email(raw)
        # Messages are frequently not valid UTF-8; regex operations raise on
        # invalid byte sequences, so replace them up front.
        raw = raw.scrub unless raw.valid_encoding?

        # Headers end at the first blank line.
        header_text, body = raw.split(/\r?\n\r?\n/, 2)

        [parse_headers(header_text.to_s), clean_body(body.to_s)]
      end

      # Builds the header hash. When a header name repeats, the last
      # occurrence wins.
      def parse_headers(header_text)
        # Long header values may be folded onto continuation lines that
        # start with whitespace; join them back before splitting.
        unfolded = header_text.gsub(/\r?\n[ \t]+/, " ")

        unfolded.split(/\r?\n/).each_with_object({}) do |line, headers|
          next unless line.match?(HEADER_LINE)

          name, value = line.split(":", 2)
          headers[name.strip.downcase] = value&.strip
        end
      end

      # Removes markup when the body looks like HTML (tags are replaced by
      # spaces and whitespace is collapsed); otherwise just trims it.
      def clean_body(body)
        return body.strip unless body.match?(HTML_HINT)

        body.gsub(/<[^>]+>/, " ").gsub(/\s+/, " ").strip
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "zip" if defined?(Zip) || begin; require "zip"; rescue LoadError; false; end
4
+
5
module LoaderRuby
  module Loaders
    # Loads the text of an EPUB by concatenating the stripped text of every
    # (X)HTML member of the archive, in archive order. Requires the optional
    # rubyzip gem.
    class Epub < Base
      HTML_EXTENSIONS = %w[.xhtml .html .htm].freeze

      # @param path [String] path of the .epub file
      # @param opts [Hash] accepted for interface parity with the other
      #   loaders; not used here
      # @return [Document]
      # @raise [DependencyMissingError] when rubyzip cannot be required
      def load(path, **opts)
        check_file_exists!(path)
        check_file_size!(path)

        begin
          require "zip"
        rescue LoadError
          raise DependencyMissingError, "rubyzip gem is required for EPUB loading"
        end

        Document.new(
          content: extract_text(path),
          metadata: build_metadata(path, format: :epub)
        )
      end

      private

      # Collects the text of each HTML-like entry, separated by blank lines.
      def extract_text(path)
        sections = []
        Zip::File.open(path) do |zip|
          zip.each do |entry|
            next unless entry.name.downcase.end_with?(*HTML_EXTENSIONS)

            sections << strip_html(read_utf8(entry))
          end
        end
        sections.join("\n\n")
      end

      # Reads an entry and tags the bytes as UTF-8, replacing invalid
      # sequences so later regex work cannot raise.
      # NOTE(review): assumes content documents are UTF-8 (UTF-16 is not
      # handled) — confirm against the files you expect.
      def read_utf8(entry)
        raw = entry.get_input_stream(&:read).to_s
        raw.dup.force_encoding(Encoding::UTF_8).scrub
      end

      # Drops script/style elements and all remaining tags, then collapses
      # whitespace. Entities are left as-is.
      def strip_html(html)
        html.gsub(/<script[^>]*>.*?<\/script>/mi, "")
            .gsub(/<style[^>]*>.*?<\/style>/mi, "")
            .gsub(/<[^>]+>/, " ")
            .gsub(/\s+/, " ")
            .strip
      end
    end
  end
end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
module LoaderRuby
  module Loaders
    # Loads a JSON file into a Document, either pretty-printed in full or
    # reduced to the values stored under one key.
    class Json < Base
      # @param path [String] path of the JSON file
      # @param text_key [String, Symbol, nil] when given, only the value(s)
      #   under this key become the content; otherwise the whole document is
      #   pretty-printed
      # @param opts [Hash] accepted for interface parity with the other
      #   loaders; not used here
      # @return [Document] metadata includes the top-level keys when the
      #   document is an object, nil otherwise
      # @raise [JSON::ParserError] when the file is not valid JSON
      def load(path, text_key: nil, **opts)
        check_file_exists!(path)
        check_file_size!(path)

        # "bom|utf-8" drops a leading byte-order mark, which the JSON parser
        # would otherwise reject.
        data = ::JSON.parse(File.read(path, encoding: "bom|utf-8"))
        content = text_key ? extract_by_key(data, text_key) : ::JSON.pretty_generate(data)
        top_level_keys = data.is_a?(Hash) ? data.keys : nil

        Document.new(
          content: content,
          metadata: build_metadata(path, format: :json, keys: top_level_keys)
        )
      end

      private

      # Pulls the text for +key+ out of parsed JSON:
      # - Array: one line per element (the keyed value for objects, the
      #   element itself otherwise)
      # - Hash: the keyed value
      # - anything else: the value itself
      def extract_by_key(data, key)
        # Parsed JSON objects always have String keys, so a Symbol key would
        # never match without this coercion.
        key = key.to_s

        case data
        when Array
          data.map { |item| item.is_a?(Hash) ? item[key].to_s : item.to_s }.join("\n")
        when Hash
          data[key].to_s
        else
          data.to_s
        end
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
module LoaderRuby
  module Loaders
    # Loads an RTF file as plain text using a lightweight, regex-based
    # stripper (no external dependency).
    class Rtf < Base
      # One pass over every RTF token, most specific alternative first:
      #   literal - escaped backslash or brace that must survive as text
      #   hex     - \'hh single-byte escape
      #   unicode - \uN escape, followed by one fallback character
      #   then generic control words (optionally with a signed numeric
      #   parameter and one delimiting space) and bare group braces.
      RTF_TOKEN = /\\(?<literal>[\\{}])|\\'(?<hex>\h{2})|\\u(?<unicode>-?\d+)[ ]?(?:\\'\h{2}|[^\\{}])?|\\[a-z]+-?\d*[ ]?|[{}]/i

      # @param path [String] path of the .rtf file
      # @param opts [Hash] accepted for interface parity with the other
      #   loaders; not used here
      # @return [Document]
      def load(path, **opts)
        check_file_exists!(path)
        check_file_size!(path)

        Document.new(
          content: strip_rtf(File.read(path)),
          metadata: build_metadata(path, format: :rtf)
        )
      end

      private

      # Removes RTF markup and returns whitespace-normalised text.
      #
      # Text that lives inside destination groups (font tables, colour
      # tables, ...) is not filtered out and still appears in the result.
      def strip_rtf(text)
        # Regex substitution raises on invalid byte sequences.
        text = text.scrub unless text.valid_encoding?

        plain = text.gsub(RTF_TOKEN) do
          token = Regexp.last_match
          if token[:literal]
            token[:literal]
          elsif token[:hex]
            decode_ansi_byte(token[:hex])
          elsif token[:unicode]
            decode_unicode(token[:unicode])
          else
            ""
          end
        end

        plain.gsub(/\s+/, " ").strip
      end

      # Decodes a \'hh escape.
      # NOTE(review): assumes the document code page is Windows-1252 (the
      # \ansicpg declaration is not consulted) — confirm for non-Western
      # documents.
      def decode_ansi_byte(hex)
        [hex.hex].pack("C")
                 .force_encoding(Encoding::Windows_1252)
                 .encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: "")
      end

      # Decodes a \uN escape. N is a signed 16-bit value, so negative
      # numbers wrap around; lone surrogates are dropped.
      def decode_unicode(value)
        code = value.to_i % 0x10000
        return "" if code.between?(0xD800, 0xDFFF)

        [code].pack("U")
      end
    end
  end
end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
module LoaderRuby
  module Loaders
    # Loads one worksheet of a spreadsheet through the optional roo gem,
    # either as a single tab-separated Document or as one Document per row.
    class Xlsx < Base
      # @param path [String] path of the spreadsheet
      # @param sheet [String, Integer, nil] sheet name or index; the first
      #   sheet is used when omitted
      # @param row_as_document [Boolean] when true, the first row is treated
      #   as headers and every following row becomes its own Document
      # @param opts [Hash] accepted for interface parity with the other
      #   loaders; not used here
      # @return [Document, Array<Document>] an Array when row_as_document
      #   is true
      # @raise [DependencyMissingError] when roo cannot be required
      def load(path, sheet: nil, row_as_document: false, **opts)
        check_file_exists!(path)
        check_file_size!(path)

        begin
          require "roo"
        rescue LoadError
          raise DependencyMissingError, "roo gem is required for XLSX loading"
        end

        worksheet = Roo::Spreadsheet.open(path).sheet(sheet || 0)

        row_as_document ? load_rows(path, worksheet) : load_all(path, worksheet)
      end

      private

      # Yields each row as an Array of cell strings.
      #
      # pad_cells keeps empty cells in place (as nil) so that values stay
      # aligned with their columns; without it the streaming reader leaves
      # them out and later cells shift left.
      def each_row_values(worksheet)
        return to_enum(:each_row_values, worksheet) unless block_given?

        worksheet.each_row_streaming(pad_cells: true) do |row|
          yield row.map { |cell| cell&.value.to_s }
        end
      end

      # Whole sheet as one Document: tab-separated cells, one line per row.
      def load_all(path, worksheet)
        lines = each_row_values(worksheet).map { |values| values.join("\t") }

        Document.new(
          content: lines.join("\n"),
          metadata: build_metadata(path, format: :xlsx, rows: lines.size)
        )
      end

      # One Document per data row, rendered as "header: value" lines.
      # row_index is the position in the sheet, so the first data row is 1.
      def load_rows(path, worksheet)
        headers = nil
        documents = []

        each_row_values(worksheet).each_with_index do |values, index|
          if index.zero?
            headers = values
            next
          end

          documents << Document.new(
            content: headers.zip(values).map { |name, value| "#{name}: #{value}" }.join("\n"),
            metadata: build_metadata(path, format: :xlsx, row_index: index, headers: headers)
          )
        end

        documents
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
module LoaderRuby
  module Loaders
    # Loads the text content of an XML file through the optional nokogiri
    # gem. Markup is discarded; only character data is kept.
    class Xml < Base
      # @param path [String] path of the XML file
      # @param opts [Hash] accepted for interface parity with the other
      #   loaders; not used here
      # @return [Document] metadata includes the root element name (nil
      #   when the document has no root)
      # @raise [DependencyMissingError] when nokogiri cannot be required
      def load(path, **opts)
        check_file_exists!(path)
        check_file_size!(path)

        begin
          require "nokogiri"
        rescue LoadError
          raise DependencyMissingError, "nokogiri gem is required for XML loading"
        end

        doc = Nokogiri::XML(File.read(path))

        Document.new(
          content: extract_text(doc),
          metadata: build_metadata(path, format: :xml, root: doc.root&.name)
        )
      end

      private

      # Joins every text node with a space before collapsing whitespace.
      # Taking the document text in one go would glue the contents of
      # adjacent elements together ("<a>x</a><b>y</b>" => "xy").
      def extract_text(doc)
        doc.xpath("//text()").map(&:text).join(" ").gsub(/\s+/, " ").strip
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
module LoaderRuby
  # Loads many sources concurrently with a fixed pool of threads.
  #
  # Results keep the order of the input. A source that fails leaves a nil in
  # its slot and is recorded in #errors; an exception is raised only when
  # every source failed.
  class ParallelLoader
    # Failures of the most recent threaded #load call.
    # @return [Array<Hash>] entries of the form
    #   { source:, index:, error: }, ordered by index
    attr_reader :errors

    # @param threads [Integer] maximum number of worker threads; values of
    #   1 or less load sequentially on the calling thread
    def initialize(threads: 4)
      @threads = threads
      @errors = []
    end

    # @param sources [Array] sources, each handed to LoaderRuby.load
    # @param opts [Hash] forwarded unchanged to LoaderRuby.load
    # @return [Array] one result per source, nil where loading failed
    # @raise [Error] when there was at least one failure and no success.
    #   In sequential mode the first failure is raised as-is instead.
    def load(sources, **opts)
      @errors = []
      return sources.map { |s| LoaderRuby.load(s, **opts) } if @threads <= 1

      results = Array.new(sources.size)
      failures = []
      lock = Mutex.new

      # No point in starting more workers than there is work.
      worker_count = [@threads, sources.size].min

      queue = Queue.new
      sources.each_with_index { |source, index| queue << [source, index] }
      worker_count.times { queue << nil } # one stop marker per worker

      workers = Array.new(worker_count) do
        Thread.new do
          while (item = queue.pop)
            source, index = item
            begin
              doc = LoaderRuby.load(source, **opts)
              lock.synchronize { results[index] = doc }
            rescue StandardError => e
              lock.synchronize { failures << { source: source, index: index, error: e } }
            end
          end
        end
      end
      workers.each(&:join)

      # Completion order is arbitrary; report failures in input order.
      @errors = failures.sort_by { |failure| failure[:index] }

      if @errors.any? && results.compact.empty?
        first = @errors.first
        raise Error, "#{@errors.size} files failed to load " \
                     "(first: #{first[:source]}: #{first[:error].message})"
      end

      results
    end
  end
end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
module LoaderRuby
  # Reads a file piece by piece and hands each piece to the caller's block,
  # so the whole file never has to be held in memory at once.
  class StreamingLoader
    DEFAULT_CHUNK_SIZE = 64 * 1024 # 64KB

    # @param chunk_size [Integer] number of bytes per chunk yielded by #load
    # @raise [ArgumentError] when chunk_size is not a positive integer.
    #   A size of zero or nil would make the read loop in #load spin
    #   forever on empty strings.
    def initialize(chunk_size: DEFAULT_CHUNK_SIZE)
      size = chunk_size.respond_to?(:to_int) ? chunk_size.to_int : nil
      unless size&.positive?
        raise ArgumentError, "chunk_size must be a positive integer, got #{chunk_size.inspect}"
      end

      @chunk_size = size
    end

    # Yields the file in binary chunks of at most chunk_size bytes.
    #
    # @param path [String] file to read
    # @yieldparam chunk [String] the next chunk (binary encoding)
    # @return [nil]
    # @raise [ArgumentError] when no block is given
    # @raise [FileNotFoundError] when path does not exist
    def load(path, &block)
      ensure_streamable!(path, block)

      File.open(path, "rb") do |file|
        while (chunk = file.read(@chunk_size))
          yield chunk
        end
      end
    end

    # Yields the file line by line, line terminators included.
    #
    # @param path [String] file to read
    # @yieldparam line [String] the next line
    # @return [nil]
    # @raise [ArgumentError] when no block is given
    # @raise [FileNotFoundError] when path does not exist
    def load_lines(path, &block)
      ensure_streamable!(path, block)

      File.foreach(path) { |line| yield line }
    end

    private

    # Shared precondition checks; the block is checked before the path.
    def ensure_streamable!(path, block)
      raise ArgumentError, "Block required for streaming" unless block
      raise FileNotFoundError, "File not found: #{path}" unless File.exist?(path)
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module LoaderRuby
4
- VERSION = "0.1.1"
4
+ VERSION = "0.2.0"
5
5
  end
data/lib/loader_ruby.rb CHANGED
@@ -13,6 +13,15 @@ require_relative "loader_ruby/loaders/docx"
13
13
  require_relative "loader_ruby/loaders/csv"
14
14
  require_relative "loader_ruby/loaders/html"
15
15
  require_relative "loader_ruby/loaders/web"
16
+ require_relative "loader_ruby/loaders/json_loader"
17
+ require_relative "loader_ruby/loaders/xml"
18
+ require_relative "loader_ruby/loaders/epub"
19
+ require_relative "loader_ruby/loaders/rtf"
20
+ require_relative "loader_ruby/loaders/email"
21
+ require_relative "loader_ruby/loaders/xlsx"
22
+ require_relative "loader_ruby/format_detector"
23
+ require_relative "loader_ruby/parallel_loader"
24
+ require_relative "loader_ruby/streaming_loader"
16
25
 
17
26
  module LoaderRuby
18
27
  FORMAT_MAP = {
@@ -21,7 +30,13 @@ module LoaderRuby
21
30
  ".pdf" => Loaders::Pdf,
22
31
  ".docx" => Loaders::Docx,
23
32
  ".csv" => Loaders::Csv, ".tsv" => Loaders::Csv,
24
- ".html" => Loaders::Html, ".htm" => Loaders::Html
33
+ ".html" => Loaders::Html, ".htm" => Loaders::Html,
34
+ ".json" => Loaders::Json,
35
+ ".xml" => Loaders::Xml,
36
+ ".epub" => Loaders::Epub,
37
+ ".rtf" => Loaders::Rtf,
38
+ ".eml" => Loaders::Email,
39
+ ".xlsx" => Loaders::Xlsx, ".xls" => Loaders::Xlsx
25
40
  }.freeze
26
41
 
27
42
  class << self
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: loader-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Johannes Dwi Cahyo
@@ -82,14 +82,23 @@ files:
82
82
  - lib/loader_ruby/document.rb
83
83
  - lib/loader_ruby/encoding_detector.rb
84
84
  - lib/loader_ruby/error.rb
85
+ - lib/loader_ruby/format_detector.rb
85
86
  - lib/loader_ruby/html_extractor.rb
86
87
  - lib/loader_ruby/loaders/base.rb
87
88
  - lib/loader_ruby/loaders/csv.rb
88
89
  - lib/loader_ruby/loaders/docx.rb
90
+ - lib/loader_ruby/loaders/email.rb
91
+ - lib/loader_ruby/loaders/epub.rb
89
92
  - lib/loader_ruby/loaders/html.rb
93
+ - lib/loader_ruby/loaders/json_loader.rb
90
94
  - lib/loader_ruby/loaders/pdf.rb
95
+ - lib/loader_ruby/loaders/rtf.rb
91
96
  - lib/loader_ruby/loaders/text.rb
92
97
  - lib/loader_ruby/loaders/web.rb
98
+ - lib/loader_ruby/loaders/xlsx.rb
99
+ - lib/loader_ruby/loaders/xml.rb
100
+ - lib/loader_ruby/parallel_loader.rb
101
+ - lib/loader_ruby/streaming_loader.rb
93
102
  - lib/loader_ruby/version.rb
94
103
  - loader-ruby.gemspec
95
104
  homepage: https://github.com/johannesdwicahyo/loader-ruby