loader-ruby 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +14 -0
- data/LICENSE +21 -0
- data/README.md +38 -0
- data/Rakefile +11 -0
- data/lib/loader_ruby/configuration.rb +15 -0
- data/lib/loader_ruby/document.rb +43 -0
- data/lib/loader_ruby/encoding_detector.rb +61 -0
- data/lib/loader_ruby/error.rb +11 -0
- data/lib/loader_ruby/html_extractor.rb +32 -0
- data/lib/loader_ruby/loaders/base.rb +51 -0
- data/lib/loader_ruby/loaders/csv.rb +55 -0
- data/lib/loader_ruby/loaders/docx.rb +33 -0
- data/lib/loader_ruby/loaders/html.rb +34 -0
- data/lib/loader_ruby/loaders/pdf.rb +36 -0
- data/lib/loader_ruby/loaders/text.rb +27 -0
- data/lib/loader_ruby/loaders/web.rb +94 -0
- data/lib/loader_ruby/version.rb +5 -0
- data/lib/loader_ruby.rb +59 -0
- data/loader-ruby.gemspec +35 -0
- metadata +119 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 663d36e070a5a41ba1c34ff5cc7d734b34f92eebbb7a5e104d7e2ceb7f4854f0
|
|
4
|
+
data.tar.gz: fbea2f56dce6dfca84ff898fa3626c0ec56092d3466fdc82f0327f9cd937a5ba
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 9cb3f7f935f3779a68524f11af2c734d2219b68c1a7d3b8c0cd841a71052452146d55819d6e91938da7d8904b7d028899beb5b835cd9354be621f6e2ad9de344
|
|
7
|
+
data.tar.gz: 59df44fe24d1e8a7d15bf48e9da9f72fca101a0ea8ae81b46c9cf9bfc45b86c65dfa5388def61a4ec86bc7a0d7e0b354bd31a214490cddbd75fb78e8b8d928fd
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.1.0 (2026-03-09)
|
|
4
|
+
|
|
5
|
+
- Initial release
|
|
6
|
+
- Text/Markdown loader
|
|
7
|
+
- PDF loader (via pdf-reader gem)
|
|
8
|
+
- DOCX loader (via docx gem)
|
|
9
|
+
- CSV/TSV loader with row-as-document support
|
|
10
|
+
- HTML loader (via nokogiri gem)
|
|
11
|
+
- Web page loader with HTTP fetching
|
|
12
|
+
- Auto-detection of file format
|
|
13
|
+
- Normalized Document result object
|
|
14
|
+
- Configuration DSL
|
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Johannes Dwi Cahyo
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# loader-ruby
|
|
2
|
+
|
|
3
|
+
Document loader library for Ruby RAG pipelines. Extracts text from plain-text/Markdown, PDF, DOCX, CSV, and HTML files, and from web pages.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```ruby
|
|
8
|
+
gem "loader-ruby", "~> 0.1"
|
|
9
|
+
|
|
10
|
+
# Optional dependencies for specific formats:
|
|
11
|
+
gem "pdf-reader" # PDF support
|
|
12
|
+
gem "nokogiri" # HTML/web support
|
|
13
|
+
gem "docx" # DOCX support
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Usage
|
|
17
|
+
|
|
18
|
+
```ruby
|
|
19
|
+
require "loader_ruby"
|
|
20
|
+
|
|
21
|
+
doc = LoaderRuby.load("document.pdf")
|
|
22
|
+
doc.content # => extracted text
|
|
23
|
+
doc.metadata # => { source: "document.pdf", format: :pdf, pages: 12, ... }
|
|
24
|
+
|
|
25
|
+
doc = LoaderRuby.load("notes.md")
|
|
26
|
+
|
|
27
|
+
doc = LoaderRuby.load("data.csv")
|
|
28
|
+
|
|
29
|
+
docs = LoaderRuby::Loaders::Csv.new.load("data.csv", row_as_document: true)
|
|
30
|
+
|
|
31
|
+
doc = LoaderRuby.load("https://example.com/page")
|
|
32
|
+
|
|
33
|
+
docs = LoaderRuby.load_batch(["file1.pdf", "file2.docx"])
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## License
|
|
37
|
+
|
|
38
|
+
MIT
|
data/Rakefile
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LoaderRuby
  # Process-wide settings consulted by the loaders.
  #
  # A single instance is handed out by LoaderRuby.configuration and can be
  # adjusted through LoaderRuby.configure.
  class Configuration
    # Encoding name the text loader falls back to when neither an explicit
    # encoding nor a BOM is available.
    attr_accessor :default_encoding

    # Largest file, in bytes, a file-based loader accepts.
    attr_accessor :max_file_size

    # Read timeout, in seconds, applied to HTTP requests made by the web loader.
    attr_accessor :http_timeout

    # Value sent in the User-Agent header by the web loader.
    attr_accessor :web_user_agent

    # Populates every setting with its default value.
    def initialize
      self.default_encoding = "UTF-8"
      self.max_file_size = 100 * 1024 * 1024 # 100 MiB
      self.http_timeout = 30
      self.web_user_agent = "LoaderRuby/#{VERSION}"
    end
  end
end
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LoaderRuby
  # Normalized result of loading one source: the extracted text plus a
  # metadata hash describing where it came from.
  class Document
    attr_reader :content, :metadata

    # @param content [String, nil] extracted text
    # @param metadata [Hash] descriptive attributes; the readers below look up
    #   the +:source+, +:format+ and +:pages+ keys
    def initialize(content:, metadata: {})
      @content = content
      @metadata = metadata
    end

    # @return [Object, nil] the +:source+ metadata entry
    def source
      @metadata[:source]
    end

    # @return [Object, nil] the +:format+ metadata entry
    def format
      @metadata[:format]
    end

    # @return [Object, nil] the +:pages+ metadata entry
    def pages
      @metadata[:pages]
    end

    # Length of the content in characters.
    #
    # Nil content counts as zero so that #size and #to_s stay usable on the
    # same documents for which #empty? already returns true.
    #
    # @return [Integer]
    def size
      @content.to_s.length
    end

    # @return [Boolean] true when the content is nil or only whitespace
    def empty?
      @content.nil? || @content.strip.empty?
    end

    # @return [Hash] the content and metadata under +:content+ / +:metadata+
    def to_h
      {
        content: @content,
        metadata: @metadata
      }
    end

    # @return [String] short human-readable summary
    def to_s
      # `self.format` is explicit: a bare `format` reads like Kernel#format.
      "Document(source: #{source}, format: #{self.format}, size: #{size})"
    end
  end
end
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LoaderRuby
  # Mixin that detects a text encoding from a byte-order mark or a
  # Content-Type header and transcodes content to UTF-8.
  #
  # All helpers are private; they are meant to be called from the including
  # loader only.
  module EncodingDetector
    # Byte-order marks (binary strings) mapped to the encoding they announce.
    BOM_MAP = {
      "\xEF\xBB\xBF".b => "UTF-8",
      "\xFF\xFE".b => "UTF-16LE",
      "\xFE\xFF".b => "UTF-16BE",
      "\xFF\xFE\x00\x00".b => "UTF-32LE",
      "\x00\x00\xFE\xFF".b => "UTF-32BE"
    }.freeze

    private

    # Detect encoding from BOM bytes at the start of raw content.
    #
    # @param raw_bytes [String] binary (ASCII-8BIT) content; BOM_MAP keys are
    #   binary strings, so differently tagged input will not match
    # @return [String, nil] encoding name, or nil when no known BOM is present
    def detect_encoding_from_bom(raw_bytes)
      # Longest BOM first: the UTF-32LE mark starts with the UTF-16LE mark.
      [4, 3, 2].each do |width|
        next if raw_bytes.bytesize < width

        encoding = BOM_MAP[raw_bytes.byteslice(0, width)]
        return encoding if encoding
      end

      nil
    end

    # Detect encoding from a Content-Type header value,
    # e.g. "text/html; charset=iso-8859-1" or 'text/html; charset="utf-8"'.
    #
    # @param content_type [String, nil] raw header value
    # @return [String, nil] the charset name without surrounding quotes, or
    #   nil when the header is missing or names no charset
    def detect_encoding_from_content_type(content_type)
      return nil unless content_type

      charset = content_type[/charset=([^\s;]+)/i, 1]
      return nil unless charset

      # The parameter value may be a quoted string; the quotes are not part of
      # the encoding name and would make the later transcoding step fail.
      charset = charset.delete("\"'").strip
      charset.empty? ? nil : charset
    end

    # Transcode content to UTF-8 from the detected or specified encoding.
    #
    # @param content [String] text to convert
    # @param source_encoding [String, Encoding, nil] encoding +content+ is
    #   actually in; nil returns +content+ untouched
    # @return [String] UTF-8 string with invalid/undefined bytes replaced and
    #   a leading byte-order mark removed
    # @raise [LoaderRuby::EncodingError] when Ruby cannot convert from
    #   +source_encoding+
    def transcode_to_utf8(content, source_encoding)
      return content if source_encoding.nil?

      # to_s lets callers pass an Encoding object as well as a name.
      encoding_name = source_encoding.to_s.strip
      already_utf8 = encoding_name.casecmp?("UTF-8") &&
                     content.encoding == ::Encoding::UTF_8 &&
                     content.valid_encoding?

      utf8 =
        if already_utf8
          content
        else
          begin
            content.encode("UTF-8", encoding_name, invalid: :replace, undef: :replace)
          rescue ::EncodingError => e
            # `::` is required: inside this namespace EncodingError is the gem's own class.
            raise LoaderRuby::EncodingError, "Failed to transcode from #{source_encoding} to UTF-8: #{e.message}"
          end
        end

      # A BOM survives transcoding as U+FEFF; it is a signature, not content.
      utf8.delete_prefix("\uFEFF")
    end
  end
end
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LoaderRuby
  # Root of the gem's exception hierarchy; rescue this to catch any error the
  # gem raises on its own behalf.
  class Error < StandardError
  end

  # The given path does not exist.
  class FileNotFoundError < Error
  end

  # No loader is registered for the source's file extension.
  class UnsupportedFormatError < Error
  end

  # The file is larger than the configured max_file_size.
  class FileTooLargeError < Error
  end

  # An optional gem needed for the requested format could not be required.
  class DependencyMissingError < Error
  end

  # An HTTP fetch followed more redirects than allowed.
  class TooManyRedirectsError < Error
  end

  # Content could not be transcoded to UTF-8. Inside this namespace the name
  # shadows Ruby's built-in ::EncodingError.
  class EncodingError < Error
  end
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LoaderRuby
  # HTML-to-text helpers shared by the Html and Web loaders.
  #
  # Every method is private and intended for use from the including class.
  module HtmlExtractor
    # CSS selector list of the elements dropped before text extraction.
    REMOVE_SELECTORS = "script, style, nav, footer, header"

    private

    # Loads nokogiri on demand so the gem stays an optional dependency.
    #
    # @raise [DependencyMissingError] when nokogiri is not installed
    def require_nokogiri!
      begin
        require "nokogiri"
      rescue LoadError
        raise DependencyMissingError,
              "nokogiri gem is required for HTML loading. Add `gem 'nokogiri'` to your Gemfile."
      end
    end

    # Parses markup and strips the REMOVE_SELECTORS elements from the tree.
    #
    # @param html [String] markup to parse
    # @return [Object] the parsed document with those elements removed
    def parse_html(html)
      Nokogiri::HTML(html).tap do |tree|
        tree.css(REMOVE_SELECTORS).remove
      end
    end

    # @param doc [Object] parsed document
    # @return [String, nil] stripped text of the first <title>, nil if absent
    def extract_title(doc)
      title_node = doc.at_css("title")
      title_node&.text&.strip
    end

    # Returns the text of <body> (or of the whole document when there is no
    # body) with every run of whitespace collapsed to one space.
    #
    # @param doc [Object] parsed document
    # @return [String]
    def extract_text(doc)
      (doc.at_css("body") || doc).text.gsub(/\s+/, " ").strip
    end
  end
end
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "uri"
|
|
4
|
+
|
|
5
|
+
module LoaderRuby
  module Loaders
    # Abstract parent of every loader. Subclasses implement #load; the private
    # helpers below provide the shared validation and metadata plumbing.
    class Base
      # Loads +source+ and returns the extracted document(s).
      #
      # @param source [Object] path or URL, depending on the subclass
      # @param opts [Hash] loader-specific options
      # @raise [NotImplementedError] always, unless overridden by a subclass
      def load(source, **opts)
        raise NotImplementedError, "#{self.class}#load not implemented"
      end

      private

      # @param path [Object] candidate file path
      # @raise [ArgumentError] when +path+ is nil or a blank String
      def validate_path!(path)
        raise ArgumentError, "path cannot be nil" if path.nil?
        raise ArgumentError, "path cannot be empty" if path.is_a?(String) && path.strip.empty?
      end

      # @param url [String] candidate URL
      # @raise [ArgumentError] when +url+ is nil, blank, or not an http(s) URL
      # @raise [URI::InvalidURIError] when URI.parse rejects +url+ as malformed
      def validate_url!(url)
        raise ArgumentError, "URL cannot be nil" if url.nil?
        raise ArgumentError, "URL cannot be empty" if url.is_a?(String) && url.strip.empty?

        # URI::HTTPS is a subclass of URI::HTTP, so one check covers both schemes.
        raise ArgumentError, "Invalid URL: #{url}" unless URI.parse(url).is_a?(URI::HTTP)
      end

      # @param path [Object] file path
      # @raise [ArgumentError] when +path+ is nil or blank
      # @raise [FileNotFoundError] when nothing exists at +path+
      def check_file_exists!(path)
        validate_path!(path)
        raise FileNotFoundError, "File not found: #{path}" unless File.exist?(path)
      end

      # @param path [Object] path of an existing file
      # @raise [FileTooLargeError] when the file exceeds the configured max_file_size
      def check_file_size!(path)
        max = LoaderRuby.configuration.max_file_size
        size = File.size(path)
        return unless size > max

        raise FileTooLargeError, "File too large: #{size} bytes (max: #{max})"
      end

      # Builds the metadata hash attached to a Document.
      #
      # @param source [Object] path or URL the content came from
      # @param format [Symbol] loader format tag
      # @param extra [Hash] loader-specific entries; they override the base
      #   keys on collision
      # @return [Hash] +:source+, +:format+, +:loaded_at+ merged with +extra+
      def build_metadata(source, format:, **extra)
        {
          source: source,
          format: format,
          # Same text Time#iso8601 gives for a local-mode time such as Time.now,
          # but strftime is core: before Ruby 3.4 #iso8601 exists only after
          # `require "time"`, which this gem never requires.
          loaded_at: Time.now.strftime("%Y-%m-%dT%H:%M:%S%:z")
        }.merge(extra)
      end
    end
  end
end
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
|
|
5
|
+
module LoaderRuby
  module Loaders
    # Loads comma- or tab-separated files whose first row holds the headers.
    # The metadata format is +:csv+ for both variants.
    class Csv < Base
      EXTENSIONS = %w[.csv .tsv].freeze

      # @param path [String, Pathname] file to read
      # @param row_as_document [Boolean] when true, return one Document per
      #   data row instead of a single Document for the whole table
      # @param opts [Hash] accepted for interface parity; currently unused
      # @return [Document, Array<Document>]
      # @raise [FileNotFoundError] when +path+ does not exist
      # @raise [FileTooLargeError] when the file exceeds the configured limit
      def load(path, row_as_document: false, **opts)
        check_file_exists!(path)
        check_file_size!(path)

        table = ::CSV.read(path, headers: true, col_sep: column_separator(path))

        if row_as_document
          load_rows_as_documents(path, table)
        else
          load_as_single_document(path, table)
        end
      end

      private

      # Picks the column separator from the file name.
      #
      # The comparison ignores case because LoaderRuby.load also downcases the
      # extension when it routes ".TSV" files here; to_s lets Pathname through.
      def column_separator(path)
        path.to_s.downcase.end_with?(".tsv") ? "\t" : ","
      end

      # Renders one row as "header: value" pairs joined by +joiner+.
      def format_row(row, joiner)
        row.to_h.map { |header, value| "#{header}: #{value}" }.join(joiner)
      end

      # One Document for the whole table: one line per row, pairs joined by ", ".
      def load_as_single_document(path, table)
        content = table.map { |row| format_row(row, ", ") }.join("\n")

        Document.new(
          content: content,
          metadata: build_metadata(path,
            format: :csv,
            rows: table.size,
            headers: table.headers
          )
        )
      end

      # One Document per row: one "header: value" pair per line, with the
      # zero-based row position stored under +:row_index+.
      def load_rows_as_documents(path, table)
        table.map.with_index do |row, i|
          Document.new(
            content: format_row(row, "\n"),
            metadata: build_metadata(path,
              format: :csv,
              row_index: i,
              headers: table.headers
            )
          )
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LoaderRuby
  module Loaders
    # Extracts paragraph text from .docx files through the optional docx gem.
    class Docx < Base
      EXTENSIONS = %w[.docx].freeze

      # @param path [String] file to read
      # @param opts [Hash] accepted for interface parity; currently unused
      # @return [Document] paragraphs joined by newlines, with the paragraph
      #   count stored under +:paragraphs+
      # @raise [FileNotFoundError] when +path+ does not exist
      # @raise [FileTooLargeError] when the file exceeds the configured limit
      # @raise [DependencyMissingError] when the docx gem is not installed
      def load(path, **opts)
        check_file_exists!(path)
        check_file_size!(path)
        require_docx!

        # `::Docx` is the gem's namespace; the bare name would be this class.
        texts = ::Docx::Document.open(path).paragraphs.map(&:text)
        body = texts.join("\n")
        metadata = build_metadata(path, format: :docx, paragraphs: texts.size)

        Document.new(content: body, metadata: metadata)
      end

      private

      # Loads the docx gem on demand so it stays an optional dependency.
      def require_docx!
        require "docx"
      rescue LoadError
        raise DependencyMissingError,
              "docx gem is required for DOCX loading. Add `gem 'docx'` to your Gemfile."
      end
    end
  end
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LoaderRuby
  module Loaders
    # Extracts the title and visible text from local HTML files.
    class Html < Base
      include HtmlExtractor
      include EncodingDetector

      EXTENSIONS = %w[.html .htm].freeze

      # @param path [String] file to read
      # @param opts [Hash] +:encoding+ names the file's encoding explicitly,
      #   as the Text loader already allows; without it a BOM is consulted
      #   and UTF-8 is assumed otherwise
      # @return [Document] body text, with the page title under +:title+
      # @raise [FileNotFoundError] when +path+ does not exist
      # @raise [FileTooLargeError] when the file exceeds the configured limit
      # @raise [DependencyMissingError] when nokogiri is not installed
      def load(path, **opts)
        check_file_exists!(path)
        check_file_size!(path)
        require_nokogiri!

        raw = File.binread(path)
        # LoaderRuby.load forwards the same options to every loader, so an
        # explicit encoding must win here just as it does for text files.
        source_encoding = opts[:encoding] || detect_encoding_from_bom(raw) || "UTF-8"
        html = transcode_to_utf8(raw, source_encoding)

        doc = parse_html(html)
        title = extract_title(doc)
        content = extract_text(doc)

        Document.new(
          content: content,
          metadata: build_metadata(path,
            format: :html,
            title: title
          )
        )
      end
    end
  end
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LoaderRuby
  module Loaders
    # Extracts page text from PDF files through the optional pdf-reader gem.
    class Pdf < Base
      EXTENSIONS = %w[.pdf].freeze

      # @param path [String] file to read
      # @param password [String, nil] handed to the reader when given
      # @param opts [Hash] accepted for interface parity; currently unused
      # @return [Document] page texts joined by blank lines, with the page
      #   count under +:pages+ and the reader's info under +:info+
      # @raise [FileNotFoundError] when +path+ does not exist
      # @raise [FileTooLargeError] when the file exceeds the configured limit
      # @raise [DependencyMissingError] when pdf-reader is not installed
      def load(path, password: nil, **opts)
        check_file_exists!(path)
        check_file_size!(path)
        require_pdf_reader!

        # Only mention :password when the caller supplied one.
        reader_opts = password ? { password: password } : {}
        reader = PDF::Reader.new(path, **reader_opts)
        page_texts = reader.pages.map(&:text)

        Document.new(
          content: page_texts.join("\n\n"),
          metadata: build_metadata(path, format: :pdf, pages: reader.page_count, info: reader.info)
        )
      end

      private

      # Loads pdf-reader on demand so it stays an optional dependency.
      def require_pdf_reader!
        require "pdf-reader"
      rescue LoadError
        raise DependencyMissingError,
              "pdf-reader gem is required for PDF loading. Add `gem 'pdf-reader'` to your Gemfile."
      end
    end
  end
end
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LoaderRuby
  module Loaders
    # Loads plain-text style files (text, Markdown, logs, reStructuredText).
    # The metadata format is +:text+ for all of them.
    class Text < Base
      include EncodingDetector

      EXTENSIONS = %w[.txt .md .markdown .text .log .rst].freeze

      # @param path [String] file to read
      # @param opts [Hash] +:encoding+ names the file's encoding explicitly
      # @return [Document] file content transcoded to UTF-8, with the encoding
      #   that was used stored under +:encoding+
      # @raise [FileNotFoundError] when +path+ does not exist
      # @raise [FileTooLargeError] when the file exceeds the configured limit
      def load(path, **opts)
        check_file_exists!(path)
        check_file_size!(path)

        bytes = File.binread(path)

        # Precedence: caller's choice, then a BOM, then the configured default.
        source_encoding = opts[:encoding]
        source_encoding ||= detect_encoding_from_bom(bytes)
        source_encoding ||= LoaderRuby.configuration.default_encoding

        text = transcode_to_utf8(bytes, source_encoding)
        metadata = build_metadata(path, format: :text, encoding: source_encoding)

        Document.new(content: text, metadata: metadata)
      end
    end
  end
end
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "net/http"
|
|
4
|
+
require "uri"
|
|
5
|
+
require "set"
|
|
6
|
+
|
|
7
|
+
module LoaderRuby
  module Loaders
    # Fetches a web page over HTTP(S) and extracts its title and visible text.
    class Web < Base
      include HtmlExtractor
      include EncodingDetector

      DEFAULT_MAX_REDIRECTS = 5

      # @param url [String] http or https URL
      # @param max_redirects [Integer] redirects to follow before giving up
      # @param opts [Hash] accepted for interface parity; currently unused
      # @return [Document] page text, with the page title under +:title+
      # @raise [ArgumentError] when +url+ is nil, blank or not an http(s) URL
      # @raise [DependencyMissingError] when nokogiri is not installed
      # @raise [TooManyRedirectsError] when the redirect limit is exceeded
      # @raise [Error] on a non-2xx, non-redirect response or a redirect
      #   without a Location header
      def load(url, max_redirects: DEFAULT_MAX_REDIRECTS, **opts)
        validate_url!(url)
        require_nokogiri!

        html, content_type = fetch(url, max_redirects: max_redirects)

        # The header's charset wins; a BOM is only consulted when it is absent.
        detected = detect_encoding_from_content_type(content_type) ||
                   detect_encoding_from_bom(html.b)
        html = transcode_to_utf8(html, detected) if detected

        doc = parse_html(html)
        title = extract_title(doc)
        content = extract_text(doc)

        Document.new(
          content: content,
          metadata: build_metadata(url,
            format: :web,
            title: title
          )
        )
      end

      # Loads pages from a work queue seeded with +start_url+, skipping URLs
      # already visited and pages that fail to load.
      #
      # NOTE(review): nothing in this method adds discovered links to the
      # queue, so as written it visits at most +start_url+ — confirm whether
      # link extraction was intended.
      #
      # @param start_url [String] first URL to visit
      # @param max_pages [Integer] stop after this many documents
      # @param max_redirects [Integer] per-page redirect limit
      # @return [Array<Document>] successfully loaded pages
      def crawl(start_url, max_pages: 10, max_redirects: DEFAULT_MAX_REDIRECTS)
        visited = Set.new
        queue = [start_url]
        documents = []

        while queue.any? && documents.size < max_pages
          url = queue.shift
          next if visited.include?(url)

          visited << url

          begin
            documents << load(url, max_redirects: max_redirects)
          rescue StandardError
            # Best effort: a page that cannot be loaded is skipped, not fatal.
            next
          end
        end

        documents
      end

      private

      # Performs the GET, following redirects recursively.
      #
      # @return [Array(String, String|nil)] response body and Content-Type header
      def fetch(url, max_redirects:, redirects_followed: 0)
        if redirects_followed > max_redirects
          raise TooManyRedirectsError,
                "Too many redirects (followed #{redirects_followed}, max: #{max_redirects})"
        end

        response = perform_get(URI.parse(url))

        case response.code.to_i
        when 200..299
          # body is nil for responses without content; the caller expects a String.
          [response.body.to_s, response["Content-Type"]]
        when 301, 302, 303, 307, 308
          fetch(redirect_target(url, response),
                max_redirects: max_redirects,
                redirects_followed: redirects_followed + 1)
        else
          raise Error, "HTTP #{response.code} fetching #{url}"
        end
      end

      # Sends one GET request using the configured timeout and user agent.
      def perform_get(uri)
        config = LoaderRuby.configuration

        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = uri.scheme == "https"
        # Apply the timeout to connecting as well as to reading the response.
        http.open_timeout = config.http_timeout
        http.read_timeout = config.http_timeout

        request = Net::HTTP::Get.new(uri.request_uri)
        request["User-Agent"] = config.web_user_agent

        http.request(request)
      end

      # Resolves the redirect's Location header against the URL requested.
      #
      # URI.join returns absolute locations unchanged and resolves relative
      # and protocol-relative ones, so no prefix sniffing is needed.
      #
      # @raise [Error] when the response carries no usable Location header
      def redirect_target(url, response)
        location = response["Location"].to_s.strip
        if location.empty?
          raise Error, "HTTP #{response.code} redirect from #{url} has no Location header"
        end

        URI.join(url, location).to_s
      end
    end
  end
end
|
data/lib/loader_ruby.rb
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "loader_ruby/version"
|
|
4
|
+
require_relative "loader_ruby/error"
|
|
5
|
+
require_relative "loader_ruby/configuration"
|
|
6
|
+
require_relative "loader_ruby/document"
|
|
7
|
+
require_relative "loader_ruby/html_extractor"
|
|
8
|
+
require_relative "loader_ruby/encoding_detector"
|
|
9
|
+
require_relative "loader_ruby/loaders/base"
|
|
10
|
+
require_relative "loader_ruby/loaders/text"
|
|
11
|
+
require_relative "loader_ruby/loaders/pdf"
|
|
12
|
+
require_relative "loader_ruby/loaders/docx"
|
|
13
|
+
require_relative "loader_ruby/loaders/csv"
|
|
14
|
+
require_relative "loader_ruby/loaders/html"
|
|
15
|
+
require_relative "loader_ruby/loaders/web"
|
|
16
|
+
|
|
17
|
+
# Top-level entry points: configuration access plus format auto-detection.
module LoaderRuby
  # File extension (lower case, with dot) => loader class.
  #
  # Derived from each loader's EXTENSIONS list so that the extensions are
  # declared in one place only.
  FORMAT_MAP = [Loaders::Text, Loaders::Pdf, Loaders::Docx, Loaders::Csv, Loaders::Html]
               .flat_map { |loader| loader::EXTENSIONS.map { |ext| [ext, loader] } }
               .to_h
               .freeze

  class << self
    # @return [Configuration] the shared configuration, created on first use
    def configuration
      @configuration ||= Configuration.new
    end

    # Yields the shared configuration for modification.
    #
    # @yieldparam config [Configuration]
    def configure
      yield(configuration)
    end

    # Replaces the shared configuration with a fresh default instance.
    #
    # @return [Configuration]
    def reset_configuration!
      @configuration = Configuration.new
    end

    # Loads +source+ with the loader matching it: the web loader for http(s)
    # URLs, otherwise the loader registered for the file extension.
    #
    # @param source [String, Pathname, URI] file path or URL
    # @param opts [Hash] forwarded to the selected loader
    # @return [Document, Array<Document>] whatever the selected loader returns
    # @raise [ArgumentError] when +source+ is nil or a blank String
    # @raise [UnsupportedFormatError] when no loader handles the extension
    def load(source, **opts)
      raise ArgumentError, "source cannot be nil" if source.nil?
      raise ArgumentError, "source cannot be empty" if source.is_a?(String) && source.strip.empty?

      # to_s keeps Pathname and URI arguments from failing on a String-only
      # method; the scheme is matched case-insensitively, as URI schemes are.
      if source.to_s.match?(%r{\Ahttps?://}i)
        Loaders::Web.new.load(source.to_s, **opts)
      else
        ext = File.extname(source).downcase
        loader_class = FORMAT_MAP[ext]
        raise UnsupportedFormatError, "Unsupported format: #{ext}" unless loader_class

        loader_class.new.load(source, **opts)
      end
    end

    # Loads every source in order with the same options.
    #
    # @param sources [Enumerable] file paths and/or URLs
    # @param opts [Hash] forwarded to each #load call
    # @return [Array] one result per source; the first failure propagates
    def load_batch(sources, **opts)
      sources.map { |source| load(source, **opts) }
    end
  end
end
|
data/loader-ruby.gemspec
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "lib/loader_ruby/version"
|
|
4
|
+
|
|
5
|
+
# Gem packaging metadata for loader-ruby.
Gem::Specification.new do |gem|
  repo_url = "https://github.com/johannesdwicahyo/loader-ruby"

  gem.name = "loader-ruby"
  gem.version = LoaderRuby::VERSION
  gem.authors = ["Johannes Dwi Cahyo"]
  gem.email = ["johannes@example.com"]
  gem.summary = "Document loader library for Ruby RAG pipelines"
  gem.description = "Document extraction for RAG pipelines. Loads PDF, DOCX, CSV, HTML, and web pages into a normalized Document format for chunking and embedding."
  gem.homepage = repo_url
  gem.license = "MIT"
  gem.required_ruby_version = ">= 3.0.0"

  gem.metadata["homepage_uri"] = repo_url
  gem.metadata["source_code_uri"] = repo_url
  gem.metadata["changelog_uri"] = "#{repo_url}/blob/main/CHANGELOG.md"

  # Library sources plus the top-level project files.
  gem.files = Dir.glob(%w[lib/**/*.rb README.md LICENSE CHANGELOG.md Rakefile loader-ruby.gemspec])
  gem.require_paths = ["lib"]

  # Only csv is a runtime dependency; pdf-reader, docx and nokogiri are
  # required lazily by their loaders and stay optional.
  gem.add_dependency "csv"

  { "minitest" => "~> 5.0", "rake" => "~> 13.0", "webmock" => "~> 3.0" }.each do |name, requirement|
    gem.add_development_dependency name, requirement
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: loader-ruby
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.1
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Johannes Dwi Cahyo
|
|
8
|
+
bindir: bin
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: csv
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - ">="
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '0'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - ">="
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '0'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: minitest
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - "~>"
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '5.0'
|
|
33
|
+
type: :development
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - "~>"
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '5.0'
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: rake
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - "~>"
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '13.0'
|
|
47
|
+
type: :development
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - "~>"
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '13.0'
|
|
54
|
+
- !ruby/object:Gem::Dependency
|
|
55
|
+
name: webmock
|
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
|
57
|
+
requirements:
|
|
58
|
+
- - "~>"
|
|
59
|
+
- !ruby/object:Gem::Version
|
|
60
|
+
version: '3.0'
|
|
61
|
+
type: :development
|
|
62
|
+
prerelease: false
|
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
64
|
+
requirements:
|
|
65
|
+
- - "~>"
|
|
66
|
+
- !ruby/object:Gem::Version
|
|
67
|
+
version: '3.0'
|
|
68
|
+
description: Document extraction for RAG pipelines. Loads PDF, DOCX, CSV, HTML, and
|
|
69
|
+
web pages into a normalized Document format for chunking and embedding.
|
|
70
|
+
email:
|
|
71
|
+
- johannes@example.com
|
|
72
|
+
executables: []
|
|
73
|
+
extensions: []
|
|
74
|
+
extra_rdoc_files: []
|
|
75
|
+
files:
|
|
76
|
+
- CHANGELOG.md
|
|
77
|
+
- LICENSE
|
|
78
|
+
- README.md
|
|
79
|
+
- Rakefile
|
|
80
|
+
- lib/loader_ruby.rb
|
|
81
|
+
- lib/loader_ruby/configuration.rb
|
|
82
|
+
- lib/loader_ruby/document.rb
|
|
83
|
+
- lib/loader_ruby/encoding_detector.rb
|
|
84
|
+
- lib/loader_ruby/error.rb
|
|
85
|
+
- lib/loader_ruby/html_extractor.rb
|
|
86
|
+
- lib/loader_ruby/loaders/base.rb
|
|
87
|
+
- lib/loader_ruby/loaders/csv.rb
|
|
88
|
+
- lib/loader_ruby/loaders/docx.rb
|
|
89
|
+
- lib/loader_ruby/loaders/html.rb
|
|
90
|
+
- lib/loader_ruby/loaders/pdf.rb
|
|
91
|
+
- lib/loader_ruby/loaders/text.rb
|
|
92
|
+
- lib/loader_ruby/loaders/web.rb
|
|
93
|
+
- lib/loader_ruby/version.rb
|
|
94
|
+
- loader-ruby.gemspec
|
|
95
|
+
homepage: https://github.com/johannesdwicahyo/loader-ruby
|
|
96
|
+
licenses:
|
|
97
|
+
- MIT
|
|
98
|
+
metadata:
|
|
99
|
+
homepage_uri: https://github.com/johannesdwicahyo/loader-ruby
|
|
100
|
+
source_code_uri: https://github.com/johannesdwicahyo/loader-ruby
|
|
101
|
+
changelog_uri: https://github.com/johannesdwicahyo/loader-ruby/blob/main/CHANGELOG.md
|
|
102
|
+
rdoc_options: []
|
|
103
|
+
require_paths:
|
|
104
|
+
- lib
|
|
105
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
106
|
+
requirements:
|
|
107
|
+
- - ">="
|
|
108
|
+
- !ruby/object:Gem::Version
|
|
109
|
+
version: 3.0.0
|
|
110
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
111
|
+
requirements:
|
|
112
|
+
- - ">="
|
|
113
|
+
- !ruby/object:Gem::Version
|
|
114
|
+
version: '0'
|
|
115
|
+
requirements: []
|
|
116
|
+
rubygems_version: 3.6.9
|
|
117
|
+
specification_version: 4
|
|
118
|
+
summary: Document loader library for Ruby RAG pipelines
|
|
119
|
+
test_files: []
|