loader-ruby 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 663d36e070a5a41ba1c34ff5cc7d734b34f92eebbb7a5e104d7e2ceb7f4854f0
4
+ data.tar.gz: fbea2f56dce6dfca84ff898fa3626c0ec56092d3466fdc82f0327f9cd937a5ba
5
+ SHA512:
6
+ metadata.gz: 9cb3f7f935f3779a68524f11af2c734d2219b68c1a7d3b8c0cd841a71052452146d55819d6e91938da7d8904b7d028899beb5b835cd9354be621f6e2ad9de344
7
+ data.tar.gz: 59df44fe24d1e8a7d15bf48e9da9f72fca101a0ea8ae81b46c9cf9bfc45b86c65dfa5388def61a4ec86bc7a0d7e0b354bd31a214490cddbd75fb78e8b8d928fd
data/CHANGELOG.md ADDED
@@ -0,0 +1,14 @@
1
+ # Changelog
2
+
3
+ ## 0.1.0 (2026-03-09)
4
+
5
+ - Initial release
6
+ - Text/Markdown loader
7
+ - PDF loader (via pdf-reader gem)
8
+ - DOCX loader (via docx gem)
9
+ - CSV/TSV loader with row-as-document support
10
+ - HTML loader (via nokogiri gem)
11
+ - Web page loader with HTTP fetching
12
+ - Auto-detection of file format
13
+ - Normalized Document result object
14
+ - Configuration DSL
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Johannes Dwi Cahyo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,38 @@
1
+ # loader-ruby
2
+
3
+ Document loader library for Ruby RAG pipelines. Extracts text from plain-text and Markdown files, PDF, DOCX, CSV/TSV, HTML files, and web pages.
4
+
5
+ ## Installation
6
+
7
+ ```ruby
8
+ gem "loader-ruby", "~> 0.1"
9
+
10
+ # Optional dependencies for specific formats:
11
+ gem "pdf-reader" # PDF support
12
+ gem "nokogiri" # HTML/web support
13
+ gem "docx" # DOCX support
14
+ ```
15
+
16
+ ## Usage
17
+
18
+ ```ruby
19
+ require "loader_ruby"
20
+
21
+ doc = LoaderRuby.load("document.pdf")
22
+ doc.content # => extracted text
23
+ doc.metadata # => { source: "document.pdf", format: :pdf, pages: 12, ... }
24
+
25
+ doc = LoaderRuby.load("notes.md")
26
+
27
+ doc = LoaderRuby.load("data.csv")
28
+
29
+ docs = LoaderRuby::Loaders::Csv.new.load("data.csv", row_as_document: true)
30
+
31
+ doc = LoaderRuby.load("https://example.com/page")
32
+
33
+ docs = LoaderRuby.load_batch(["file1.pdf", "file2.docx"])
34
+ ```
35
+
36
+ ## License
37
+
38
+ MIT
data/Rakefile ADDED
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "rake/testtask"
4
+
5
# Defines the `test` task: adds test/ and lib/ to the load path and runs
# every file matching test/**/test_*.rb.
Rake::TestTask.new(:test) do |task|
  task.libs.push("test", "lib")
  task.test_files = FileList["test/**/test_*.rb"]
end

# Running bare `rake` runs the test task.
task default: %i[test]
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
module LoaderRuby
  # Holds the tunable settings read by the loaders. A fresh instance starts
  # with the defaults assigned in #initialize; every setting is read/write.
  class Configuration
    # Encoding name used when a file gives no other hint.
    attr_accessor :default_encoding
    # Largest accepted file size, in bytes.
    attr_accessor :max_file_size
    # Timeout applied to HTTP requests (passed to Net::HTTP by the web loader).
    attr_accessor :http_timeout
    # Value sent in the User-Agent header by the web loader.
    attr_accessor :web_user_agent

    # Populates every setting with its default value.
    def initialize
      self.default_encoding = "UTF-8"
      self.max_file_size = 100 * 1024 * 1024 # 100 MiB
      self.http_timeout = 30
      self.web_user_agent = "LoaderRuby/#{VERSION}"
    end
  end
end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
module LoaderRuby
  # Normalized result of loading a source: the extracted text plus a
  # metadata hash describing where it came from.
  class Document
    attr_reader :content, :metadata

    # @param content [String, nil] extracted text (nil is tolerated and
    #   treated as an empty document by #size and #empty?)
    # @param metadata [Hash, nil] descriptive attributes; the helper readers
    #   below look up the :source, :format and :pages keys
    def initialize(content:, metadata: {})
      @content = content
      # Fall back to an empty hash so the metadata readers never hit nil.
      @metadata = metadata || {}
    end

    # @return [Object, nil] the :source metadata entry
    def source
      @metadata[:source]
    end

    # @return [Object, nil] the :format metadata entry
    def format
      @metadata[:format]
    end

    # @return [Object, nil] the :pages metadata entry
    def pages
      @metadata[:pages]
    end

    # @return [Integer] content length in characters; 0 when content is nil,
    #   matching #empty? which also accepts nil content
    def size
      @content&.length || 0
    end

    # @return [Boolean] true when content is nil or only whitespace
    def empty?
      @content.nil? || @content.strip.empty?
    end

    # @return [Hash] the content and metadata under :content / :metadata
    def to_h
      {
        content: @content,
        metadata: @metadata
      }
    end

    # @return [String] short human-readable summary
    def to_s
      # self.format is required: a bare `format` call would need arguments
      # only for Kernel#format, but the explicit receiver keeps intent clear.
      "Document(source: #{source}, format: #{self.format}, size: #{size})"
    end
  end
end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
module LoaderRuby
  # Detects file encoding from BOM or content-type header and transcodes to UTF-8.
  # All helpers are private, so they become private instance methods of any
  # class that includes this module.
  module EncodingDetector
    # Byte-order marks (as binary strings) mapped to the encoding they announce.
    BOM_MAP = {
      "\xEF\xBB\xBF".b => "UTF-8",
      "\xFF\xFE".b => "UTF-16LE",
      "\xFE\xFF".b => "UTF-16BE",
      "\xFF\xFE\x00\x00".b => "UTF-32LE",
      "\x00\x00\xFE\xFF".b => "UTF-32BE"
    }.freeze

    # BOM lengths to probe, longest first, so the 4-byte UTF-32LE mark wins
    # over the 2-byte UTF-16LE mark it starts with.
    BOM_LENGTHS = BOM_MAP.keys.map(&:bytesize).uniq.sort.reverse.freeze

    # The BOM code point as it appears once text has been decoded to UTF-8.
    DECODED_BOM = "\uFEFF".freeze

    private

    # Detect encoding from BOM bytes at the start of raw content.
    #
    # @param raw_bytes [String, nil] leading bytes of the content
    # @return [String, nil] encoding name from BOM_MAP, or nil when no BOM matches
    def detect_encoding_from_bom(raw_bytes)
      return nil if raw_bytes.nil?

      BOM_LENGTHS.each do |length|
        next if raw_bytes.bytesize < length

        # `.b` re-tags the prefix as binary so the lookup matches the binary
        # keys even when the caller passes a UTF-8 tagged string.
        encoding_name = BOM_MAP[raw_bytes.byteslice(0, length).b]
        return encoding_name if encoding_name
      end

      nil
    end

    # Detect encoding from a Content-Type header value, e.g. "text/html; charset=iso-8859-1".
    # Quoted values (charset="utf-8") are returned without the quotes.
    #
    # @param content_type [String, nil]
    # @return [String, nil] the charset as written in the header, or nil
    def detect_encoding_from_content_type(content_type)
      return nil unless content_type

      match = content_type.match(/charset\s*=\s*["']?([^\s;"']+)/i)
      match && match[1]
    end

    # Transcode content to UTF-8 from the detected or specified encoding.
    # Returns a UTF-8 encoded string with invalid/undefined bytes replaced and
    # any leading byte-order mark removed. Content is returned untouched when
    # no source encoding is given.
    #
    # @param content [String]
    # @param source_encoding [String, Encoding, nil]
    # @return [String]
    # @raise [LoaderRuby::EncodingError] when Ruby cannot convert from source_encoding
    def transcode_to_utf8(content, source_encoding)
      return content if source_encoding.nil?

      # to_s lets callers pass an Encoding object as well as a name.
      encoding_name = source_encoding.to_s.strip
      already_utf8 = encoding_name.casecmp?("UTF-8") &&
                     content.encoding == ::Encoding::UTF_8 &&
                     content.valid_encoding?
      return strip_decoded_bom(content) if already_utf8

      begin
        strip_decoded_bom(
          content.encode("UTF-8", encoding_name, invalid: :replace, undef: :replace)
        )
      rescue ::EncodingError => e
        raise LoaderRuby::EncodingError, "Failed to transcode from #{source_encoding} to UTF-8: #{e.message}"
      end
    end

    # Drops a leading U+FEFF so the BOM does not leak into extracted text.
    def strip_decoded_bom(text)
      text.delete_prefix(DECODED_BOM)
    end
  end
end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
module LoaderRuby
  # Root of the library's exception hierarchy; every error below inherits
  # from it, so rescuing LoaderRuby::Error catches them all.
  Error = Class.new(StandardError)

  # Raised when a file path does not exist.
  FileNotFoundError = Class.new(Error)

  # Raised when no loader is registered for a source's extension.
  UnsupportedFormatError = Class.new(Error)

  # Raised when a file exceeds the configured maximum size.
  FileTooLargeError = Class.new(Error)

  # Raised when an optional gem needed by a loader cannot be required.
  DependencyMissingError = Class.new(Error)

  # Raised when an HTTP fetch follows more redirects than allowed.
  TooManyRedirectsError = Class.new(Error)

  # Raised when content cannot be transcoded to UTF-8.
  EncodingError = Class.new(Error)
end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
module LoaderRuby
  # HTML-to-text helpers mixed into the Html and Web loaders. Everything here
  # is private to the including class.
  module HtmlExtractor
    # CSS selector list of elements stripped from a document before its text
    # is extracted.
    REMOVE_SELECTORS = "script, style, nav, footer, header"

    private

    # Loads nokogiri on demand.
    #
    # @raise [DependencyMissingError] when the gem is not installed
    def require_nokogiri!
      begin
        require "nokogiri"
      rescue LoadError
        raise DependencyMissingError,
              "nokogiri gem is required for HTML loading. Add `gem 'nokogiri'` to your Gemfile."
      end
    end

    # Parses an HTML string and removes the REMOVE_SELECTORS elements.
    #
    # @return the parsed Nokogiri document
    def parse_html(html)
      Nokogiri::HTML(html).tap do |tree|
        tree.css(REMOVE_SELECTORS).remove
      end
    end

    # @return [String, nil] stripped text of the <title> element, or nil when
    #   the document has none
    def extract_title(doc)
      title_node = doc.at_css("title")
      return nil if title_node.nil?

      title_node.text&.strip
    end

    # Returns the text of <body> (or of the whole document when there is no
    # body) with each whitespace run collapsed to a single space.
    def extract_text(doc)
      root = doc.at_css("body")
      root ||= doc
      collapsed = root.text.gsub(/\s+/, " ")
      collapsed.strip
    end
  end
end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
require "time" # Time#iso8601 lives in the time stdlib on Ruby < 3.4
require "uri"

module LoaderRuby
  module Loaders
    # Common superclass for all loaders: declares the #load contract and
    # provides the validation and metadata helpers the subclasses share.
    class Base
      # Subclasses must override this to turn +source+ into a Document.
      #
      # @raise [NotImplementedError] always, in this base implementation
      def load(source, **opts)
        raise NotImplementedError, "#{self.class}#load not implemented"
      end

      private

      # @raise [ArgumentError] when path is nil or a blank String
      def validate_path!(path)
        raise ArgumentError, "path cannot be nil" if path.nil?
        raise ArgumentError, "path cannot be empty" if path.is_a?(String) && path.strip.empty?
      end

      # Accepts only http/https URLs that name a host.
      #
      # @raise [ArgumentError] when url is nil, blank, not http(s), or has no host
      # @raise [URI::InvalidURIError] when url cannot be parsed at all
      def validate_url!(url)
        raise ArgumentError, "URL cannot be nil" if url.nil?
        raise ArgumentError, "URL cannot be empty" if url.is_a?(String) && url.strip.empty?

        uri = URI.parse(url)
        # URI::HTTPS subclasses URI::HTTP, so one type check covers both.
        # The host check rejects forms such as "http:///path" that parse as
        # HTTP but give Net::HTTP nothing to connect to.
        return if uri.is_a?(URI::HTTP) && !uri.host.to_s.empty?

        raise ArgumentError, "Invalid URL: #{url}"
      end

      # @raise [ArgumentError] when path is nil or blank
      # @raise [FileNotFoundError] when nothing exists at path
      def check_file_exists!(path)
        validate_path!(path)
        raise FileNotFoundError, "File not found: #{path}" unless File.exist?(path)
      end

      # @raise [FileTooLargeError] when the file is larger than the configured
      #   max_file_size
      def check_file_size!(path)
        max = LoaderRuby.configuration.max_file_size
        size = File.size(path)
        return unless size > max

        raise FileTooLargeError, "File too large: #{size} bytes (max: #{max})"
      end

      # Builds the metadata hash attached to every Document.
      #
      # @param source [Object] path or URL the content came from
      # @param format [Symbol] loader format tag
      # @param extra [Hash] loader-specific entries; these win on key clashes
      # @return [Hash] :source, :format, :loaded_at (ISO 8601 string) plus extra
      def build_metadata(source, format:, **extra)
        {
          source: source,
          format: format,
          loaded_at: Time.now.iso8601
        }.merge(extra)
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+
5
module LoaderRuby
  module Loaders
    # Loads CSV/TSV files (first row is treated as headers) either as one
    # Document or as one Document per data row.
    class Csv < Base
      EXTENSIONS = %w[.csv .tsv].freeze

      # @param path [String] file to read; a ".tsv" extension (any letter
      #   case) selects tab as the column separator, anything else comma
      # @param row_as_document [Boolean] when true, return an Array with one
      #   Document per row instead of a single Document
      # @param opts [Hash] accepted for interface compatibility; not used here
      # @return [Document, Array<Document>]
      # @raise [FileNotFoundError, FileTooLargeError]
      def load(path, row_as_document: false, **opts)
        check_file_exists!(path)
        check_file_size!(path)

        table = ::CSV.read(path, headers: true, col_sep: column_separator(path))

        if row_as_document
          load_rows_as_documents(path, table)
        else
          load_as_single_document(path, table)
        end
      end

      private

      # Chooses the separator from the file extension, ignoring letter case
      # so the choice agrees with the case-insensitive extension lookup that
      # routes files to this loader.
      def column_separator(path)
        File.extname(path.to_s).downcase == ".tsv" ? "\t" : ","
      end

      # Renders one row as "header: value" pairs joined by +joiner+.
      def format_row(row, joiner)
        row.to_h.map { |header, value| "#{header}: #{value}" }.join(joiner)
      end

      # One Document for the whole table: one line per row, pairs separated
      # by ", ". Metadata records the row count and the headers.
      def load_as_single_document(path, table)
        content = table.map { |row| format_row(row, ", ") }.join("\n")

        Document.new(
          content: content,
          metadata: build_metadata(path,
                                   format: :csv,
                                   rows: table.size,
                                   headers: table.headers)
        )
      end

      # One Document per row: one "header: value" pair per line. Metadata
      # records the zero-based row index and the headers.
      def load_rows_as_documents(path, table)
        table.map.with_index do |row, index|
          Document.new(
            content: format_row(row, "\n"),
            metadata: build_metadata(path,
                                     format: :csv,
                                     row_index: index,
                                     headers: table.headers)
          )
        end
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
module LoaderRuby
  module Loaders
    # Loads .docx files through the optional `docx` gem, joining paragraph
    # texts with newlines.
    class Docx < Base
      EXTENSIONS = %w[.docx].freeze

      # @param path [String] file to read
      # @param opts [Hash] accepted for interface compatibility; not used here
      # @return [Document] content is the paragraph texts joined by "\n";
      #   metadata carries the paragraph count under :paragraphs
      # @raise [FileNotFoundError, FileTooLargeError, DependencyMissingError]
      def load(path, **opts)
        check_file_exists!(path)
        check_file_size!(path)
        require_docx!

        texts = ::Docx::Document.open(path).paragraphs.map { |paragraph| paragraph.text }
        joined = texts.join("\n")

        Document.new(
          content: joined,
          metadata: build_metadata(path, format: :docx, paragraphs: texts.size)
        )
      end

      private

      # Loads the docx gem on demand, translating a missing gem into the
      # library's own error type.
      def require_docx!
        require "docx"
      rescue LoadError
        raise DependencyMissingError,
              "docx gem is required for DOCX loading. Add `gem 'docx'` to your Gemfile."
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
module LoaderRuby
  module Loaders
    # Loads local HTML files: decodes the bytes to UTF-8, strips boilerplate
    # elements via HtmlExtractor and returns the remaining text.
    class Html < Base
      include HtmlExtractor
      include EncodingDetector

      EXTENSIONS = %w[.html .htm].freeze

      # @param path [String] file to read
      # @param opts [Hash] optional :encoding entry naming the file's
      #   encoding; when absent the BOM is consulted, then the configured
      #   default_encoding, then "UTF-8" (the same precedence the Text
      #   loader uses)
      # @return [Document] metadata carries the page <title> under :title
      # @raise [FileNotFoundError, FileTooLargeError, DependencyMissingError]
      def load(path, **opts)
        check_file_exists!(path)
        check_file_size!(path)
        require_nokogiri!

        raw = File.binread(path)
        source_encoding = opts[:encoding] ||
                          detect_encoding_from_bom(raw) ||
                          LoaderRuby.configuration.default_encoding ||
                          "UTF-8"
        html = transcode_to_utf8(raw, source_encoding)

        doc = parse_html(html)
        title = extract_title(doc)
        content = extract_text(doc)

        Document.new(
          content: content,
          metadata: build_metadata(path,
                                   format: :html,
                                   title: title)
        )
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
module LoaderRuby
  module Loaders
    # Loads PDF files through the optional `pdf-reader` gem, joining the text
    # of each page with a blank line.
    class Pdf < Base
      EXTENSIONS = %w[.pdf].freeze

      # @param path [String] file to read
      # @param password [String, nil] forwarded to PDF::Reader when given
      # @param opts [Hash] accepted for interface compatibility; not used here
      # @return [Document] metadata carries :pages (page count) and :info
      #   (whatever PDF::Reader#info returns)
      # @raise [FileNotFoundError, FileTooLargeError, DependencyMissingError]
      def load(path, password: nil, **opts)
        check_file_exists!(path)
        check_file_size!(path)
        require_pdf_reader!

        # Only pass the password option when one was supplied.
        reader = password ? PDF::Reader.new(path, password: password) : PDF::Reader.new(path)
        page_texts = reader.pages.map { |page| page.text }
        body = page_texts.join("\n\n")

        Document.new(
          content: body,
          metadata: build_metadata(path, format: :pdf, pages: reader.page_count, info: reader.info)
        )
      end

      private

      # Loads the pdf-reader gem on demand, translating a missing gem into
      # the library's own error type.
      def require_pdf_reader!
        require "pdf-reader"
      rescue LoadError
        raise DependencyMissingError,
              "pdf-reader gem is required for PDF loading. Add `gem 'pdf-reader'` to your Gemfile."
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
module LoaderRuby
  module Loaders
    # Loads plain-text style files (text, Markdown, logs, reStructuredText)
    # and returns their content transcoded to UTF-8.
    class Text < Base
      include EncodingDetector

      EXTENSIONS = %w[.txt .md .markdown .text .log .rst].freeze

      # @param path [String] file to read
      # @param opts [Hash] optional :encoding entry naming the file's
      #   encoding; when absent the BOM is consulted, then the configured
      #   default_encoding
      # @return [Document] metadata records the encoding that was used
      #   under :encoding
      # @raise [FileNotFoundError, FileTooLargeError]
      def load(path, **opts)
        check_file_exists!(path)
        check_file_size!(path)

        bytes = File.binread(path)

        # First truthy candidate wins: explicit option, BOM, configured default.
        encoding = opts[:encoding]
        encoding ||= detect_encoding_from_bom(bytes)
        encoding ||= LoaderRuby.configuration.default_encoding

        Document.new(
          content: transcode_to_utf8(bytes, encoding),
          metadata: build_metadata(path, format: :text, encoding: encoding)
        )
      end
    end
  end
end
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "net/http"
4
+ require "uri"
5
+ require "set"
6
+
7
module LoaderRuby
  module Loaders
    # Fetches a web page over HTTP(S), follows redirects up to a limit and
    # extracts its text with the shared HtmlExtractor helpers.
    class Web < Base
      include HtmlExtractor
      include EncodingDetector

      DEFAULT_MAX_REDIRECTS = 5

      # Status codes treated as redirects to follow.
      REDIRECT_CODES = [301, 302, 303, 307, 308].freeze

      # @param url [String] http or https URL
      # @param max_redirects [Integer] how many redirects may be followed
      # @param opts [Hash] accepted for interface compatibility; not used here
      # @return [Document] metadata carries the page <title> under :title
      # @raise [ArgumentError] when url is not a valid http(s) URL
      # @raise [DependencyMissingError] when nokogiri is not installed
      # @raise [TooManyRedirectsError] when the redirect limit is exceeded
      # @raise [Error] on a non-2xx/non-redirect response or an unusable redirect
      def load(url, max_redirects: DEFAULT_MAX_REDIRECTS, **opts)
        validate_url!(url)
        require_nokogiri!

        html, content_type = fetch(url, max_redirects: max_redirects)

        # Prefer the charset announced by the server, then a BOM. With
        # neither, the body is handed to the parser untouched.
        detected = detect_encoding_from_content_type(content_type) ||
                   detect_encoding_from_bom(html.b)
        html = transcode_to_utf8(html, detected) if detected

        doc = parse_html(html)
        title = extract_title(doc)
        content = extract_text(doc)

        Document.new(
          content: content,
          metadata: build_metadata(url,
                                   format: :web,
                                   title: title)
        )
      end

      # Loads pages from a work queue seeded with +start_url+, skipping URLs
      # already visited and pages that fail to load, until the queue is
      # empty or +max_pages+ documents have been collected.
      #
      # NOTE(review): nothing in this method adds URLs to the queue, so as
      # written only start_url is ever fetched. Link discovery appears to be
      # unimplemented; confirm the intended crawl scope before relying on it.
      #
      # @return [Array<Document>]
      def crawl(start_url, max_pages: 10, max_redirects: DEFAULT_MAX_REDIRECTS)
        visited = Set.new
        queue = [start_url]
        documents = []

        while queue.any? && documents.size < max_pages
          url = queue.shift
          next if visited.include?(url)

          visited << url

          begin
            doc = load(url, max_redirects: max_redirects)
            documents << doc
          rescue StandardError
            # Best effort: a page that cannot be loaded is skipped.
            next
          end
        end

        documents
      end

      private

      # Performs the GET request, following redirects recursively.
      #
      # @return [Array(String, String)] response body and Content-Type header
      # @raise [TooManyRedirectsError] once more than max_redirects were followed
      # @raise [Error] on any other non-2xx status or an unusable redirect
      def fetch(url, max_redirects:, redirects_followed: 0)
        if redirects_followed > max_redirects
          raise TooManyRedirectsError,
                "Too many redirects (followed #{redirects_followed}, max: #{max_redirects})"
        end

        uri = URI.parse(url)
        config = LoaderRuby.configuration

        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = uri.scheme == "https"
        # Bound the connect phase as well as reads; otherwise an unreachable
        # host waits for Net::HTTP's own default connect timeout.
        http.open_timeout = config.http_timeout
        http.read_timeout = config.http_timeout

        req = Net::HTTP::Get.new(uri.request_uri)
        req["User-Agent"] = config.web_user_agent

        response = http.request(req)

        case response.code.to_i
        when 200..299
          [response.body, response["Content-Type"]]
        when *REDIRECT_CODES
          fetch(redirect_target(url, response["Location"]),
                max_redirects: max_redirects,
                redirects_followed: redirects_followed + 1)
        else
          raise Error, "HTTP #{response.code} fetching #{url}"
        end
      end

      # Resolves a Location header against the URL that produced it and
      # checks that the result is still an http(s) URL.
      #
      # @raise [Error] when the header is missing, unparsable, or points at a
      #   non-http(s) target
      def redirect_target(current_url, location)
        if location.nil? || location.strip.empty?
          raise Error, "Redirect from #{current_url} has no Location header"
        end

        # URI.join handles absolute and relative locations alike.
        target = URI.join(current_url, location.strip).to_s
        validate_url!(target)
        target
      rescue URI::Error, ArgumentError => e
        raise Error, "Invalid redirect from #{current_url} to #{location.inspect}: #{e.message}"
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module LoaderRuby
  # Release version of the gem. Read by the gemspec and interpolated into
  # the default User-Agent string.
  VERSION = "0.1.1".freeze
end
@@ -0,0 +1,59 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "loader_ruby/version"
4
+ require_relative "loader_ruby/error"
5
+ require_relative "loader_ruby/configuration"
6
+ require_relative "loader_ruby/document"
7
+ require_relative "loader_ruby/html_extractor"
8
+ require_relative "loader_ruby/encoding_detector"
9
+ require_relative "loader_ruby/loaders/base"
10
+ require_relative "loader_ruby/loaders/text"
11
+ require_relative "loader_ruby/loaders/pdf"
12
+ require_relative "loader_ruby/loaders/docx"
13
+ require_relative "loader_ruby/loaders/csv"
14
+ require_relative "loader_ruby/loaders/html"
15
+ require_relative "loader_ruby/loaders/web"
16
+
17
module LoaderRuby
  # Maps a lower-cased file extension to the loader class that handles it.
  FORMAT_MAP = {
    ".txt" => Loaders::Text, ".md" => Loaders::Text, ".markdown" => Loaders::Text,
    ".text" => Loaders::Text, ".log" => Loaders::Text, ".rst" => Loaders::Text,
    ".pdf" => Loaders::Pdf,
    ".docx" => Loaders::Docx,
    ".csv" => Loaders::Csv, ".tsv" => Loaders::Csv,
    ".html" => Loaders::Html, ".htm" => Loaders::Html
  }.freeze

  # Matches sources that should be fetched over HTTP(S); scheme case is ignored.
  WEB_SOURCE_PATTERN = %r{\Ahttps?://}i.freeze

  class << self
    # @return [Configuration] the shared configuration, created on first use
    def configuration
      @configuration ||= Configuration.new
    end

    # Yields the shared configuration so settings can be changed in a block.
    def configure
      yield(configuration)
    end

    # Replaces the shared configuration with a fresh one holding the defaults.
    def reset_configuration!
      @configuration = Configuration.new
    end

    # Loads a single source, choosing the loader automatically: http(s) URLs
    # go to the web loader, anything else is dispatched on its file extension.
    #
    # @param source [String, #to_s] URL or file path; non-String values such
    #   as Pathname are converted with #to_s before dispatch
    # @param opts [Hash] forwarded unchanged to the chosen loader
    # @return whatever the chosen loader's #load returns
    # @raise [ArgumentError] when source is nil or blank
    # @raise [UnsupportedFormatError] when no loader handles the extension
    def load(source, **opts)
      raise ArgumentError, "source cannot be nil" if source.nil?

      # String#to_s returns the receiver itself, so String callers see no
      # change; other path-like objects gain the String API used below.
      location = source.to_s
      raise ArgumentError, "source cannot be empty" if location.strip.empty?

      return Loaders::Web.new.load(location, **opts) if location.match?(WEB_SOURCE_PATTERN)

      ext = File.extname(location).downcase
      loader_class = FORMAT_MAP[ext]
      raise UnsupportedFormatError, "Unsupported format: #{ext}" unless loader_class

      loader_class.new.load(location, **opts)
    end

    # Loads each source in order with the same options.
    #
    # @param sources [Enumerable] URLs and/or file paths
    # @return [Array] one #load result per source; the first failure
    #   propagates and aborts the batch
    def load_batch(sources, **opts)
      sources.map { |source| load(source, **opts) }
    end
  end
end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/loader_ruby/version"
4
+
5
# Gem packaging manifest for loader-ruby.
Gem::Specification.new do |spec|
  repo_url = "https://github.com/johannesdwicahyo/loader-ruby"

  # Identity
  spec.name = "loader-ruby"
  spec.version = LoaderRuby::VERSION
  spec.authors = ["Johannes Dwi Cahyo"]
  spec.email = ["johannes@example.com"]

  # Description
  spec.summary = "Document loader library for Ruby RAG pipelines"
  spec.description = "Document extraction for RAG pipelines. Loads PDF, DOCX, CSV, HTML, and web pages into a normalized Document format for chunking and embedding."
  spec.homepage = repo_url
  spec.license = "MIT"
  spec.required_ruby_version = ">= 3.0.0"

  # Links surfaced by the registry
  spec.metadata = {
    "homepage_uri" => repo_url,
    "source_code_uri" => repo_url,
    "changelog_uri" => "#{repo_url}/blob/main/CHANGELOG.md"
  }

  # Packaged files: library sources plus top-level docs and build files
  spec.files = Dir.glob(
    %w[lib/**/*.rb README.md LICENSE CHANGELOG.md Rakefile loader-ruby.gemspec]
  )
  spec.require_paths = ["lib"]

  # Runtime dependency
  spec.add_dependency "csv"

  # Development-only dependencies
  spec.add_development_dependency "minitest", "~> 5.0"
  spec.add_development_dependency "rake", "~> 13.0"
  spec.add_development_dependency "webmock", "~> 3.0"
end
metadata ADDED
@@ -0,0 +1,119 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: loader-ruby
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Johannes Dwi Cahyo
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: csv
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '0'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - ">="
24
+ - !ruby/object:Gem::Version
25
+ version: '0'
26
+ - !ruby/object:Gem::Dependency
27
+ name: minitest
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: '5.0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '5.0'
40
+ - !ruby/object:Gem::Dependency
41
+ name: rake
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '13.0'
47
+ type: :development
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '13.0'
54
+ - !ruby/object:Gem::Dependency
55
+ name: webmock
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '3.0'
61
+ type: :development
62
+ prerelease: false
63
+ version_requirements: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '3.0'
68
+ description: Document extraction for RAG pipelines. Loads PDF, DOCX, CSV, HTML, and
69
+ web pages into a normalized Document format for chunking and embedding.
70
+ email:
71
+ - johannes@example.com
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files: []
75
+ files:
76
+ - CHANGELOG.md
77
+ - LICENSE
78
+ - README.md
79
+ - Rakefile
80
+ - lib/loader_ruby.rb
81
+ - lib/loader_ruby/configuration.rb
82
+ - lib/loader_ruby/document.rb
83
+ - lib/loader_ruby/encoding_detector.rb
84
+ - lib/loader_ruby/error.rb
85
+ - lib/loader_ruby/html_extractor.rb
86
+ - lib/loader_ruby/loaders/base.rb
87
+ - lib/loader_ruby/loaders/csv.rb
88
+ - lib/loader_ruby/loaders/docx.rb
89
+ - lib/loader_ruby/loaders/html.rb
90
+ - lib/loader_ruby/loaders/pdf.rb
91
+ - lib/loader_ruby/loaders/text.rb
92
+ - lib/loader_ruby/loaders/web.rb
93
+ - lib/loader_ruby/version.rb
94
+ - loader-ruby.gemspec
95
+ homepage: https://github.com/johannesdwicahyo/loader-ruby
96
+ licenses:
97
+ - MIT
98
+ metadata:
99
+ homepage_uri: https://github.com/johannesdwicahyo/loader-ruby
100
+ source_code_uri: https://github.com/johannesdwicahyo/loader-ruby
101
+ changelog_uri: https://github.com/johannesdwicahyo/loader-ruby/blob/main/CHANGELOG.md
102
+ rdoc_options: []
103
+ require_paths:
104
+ - lib
105
+ required_ruby_version: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: 3.0.0
110
+ required_rubygems_version: !ruby/object:Gem::Requirement
111
+ requirements:
112
+ - - ">="
113
+ - !ruby/object:Gem::Version
114
+ version: '0'
115
+ requirements: []
116
+ rubygems_version: 3.6.9
117
+ specification_version: 4
118
+ summary: Document loader library for Ruby RAG pipelines
119
+ test_files: []