loader-ruby 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +23 -16
- data/lib/loader_ruby/format_detector.rb +44 -0
- data/lib/loader_ruby/loaders/email.rb +48 -0
- data/lib/loader_ruby/loaders/epub.rb +49 -0
- data/lib/loader_ruby/loaders/json_loader.rb +40 -0
- data/lib/loader_ruby/loaders/rtf.rb +30 -0
- data/lib/loader_ruby/loaders/xlsx.rb +62 -0
- data/lib/loader_ruby/loaders/xml.rb +28 -0
- data/lib/loader_ruby/parallel_loader.rb +41 -0
- data/lib/loader_ruby/streaming_loader.rb +31 -0
- data/lib/loader_ruby/version.rb +1 -1
- data/lib/loader_ruby.rb +16 -1
- metadata +10 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: f2eda1fbd4867015dffcc0364079276820546d4fcaf997a0cd80234f8f5a87b9
|
|
4
|
+
data.tar.gz: ca732a300d2474380b5bfe4c691198f3c7df44818fac399541bf94c309fca447
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: fdcd3ac61ecca175d77f3060d58dfbcb9f5e104e7dc52e3fc295afd6066f46073ba15582837aa0aa5f1395411ccdb31a1d2d46a4f68283f217d437ba80d8d51f
|
|
7
|
+
data.tar.gz: ab31d1b87da052ae16dff6d93aa215901e208b3480cf4aef46a143cf065714ae2139d5d901ebb8eeaead0427a49412efe7a30aa34248f0694e042c070543f330
|
data/README.md
CHANGED
|
@@ -1,16 +1,11 @@
|
|
|
1
1
|
# loader-ruby
|
|
2
2
|
|
|
3
|
-
Document loader library for Ruby RAG pipelines.
|
|
3
|
+
Document loader library for Ruby RAG pipelines. Load text from PDF, HTML, CSV, DOCX, and web URLs.
|
|
4
4
|
|
|
5
5
|
## Installation
|
|
6
6
|
|
|
7
7
|
```ruby
|
|
8
|
-
gem "loader-ruby"
|
|
9
|
-
|
|
10
|
-
# Optional dependencies for specific formats:
|
|
11
|
-
gem "pdf-reader" # PDF support
|
|
12
|
-
gem "nokogiri" # HTML/web support
|
|
13
|
-
gem "docx" # DOCX support
|
|
8
|
+
gem "loader-ruby"
|
|
14
9
|
```
|
|
15
10
|
|
|
16
11
|
## Usage
|
|
@@ -18,21 +13,33 @@ gem "docx" # DOCX support
|
|
|
18
13
|
```ruby
|
|
19
14
|
require "loader_ruby"
|
|
20
15
|
|
|
21
|
-
|
|
22
|
-
doc
|
|
23
|
-
doc.metadata # => { source: "document.pdf", format: :pdf, pages: 12, ... }
|
|
24
|
-
|
|
25
|
-
doc = LoaderRuby.load("notes.md")
|
|
26
|
-
|
|
16
|
+
# Auto-detect format from file extension
|
|
17
|
+
doc = LoaderRuby.load("report.pdf")
|
|
27
18
|
doc = LoaderRuby.load("data.csv")
|
|
19
|
+
doc = LoaderRuby.load("page.html")
|
|
28
20
|
|
|
29
|
-
|
|
21
|
+
# Web loader with redirect handling
|
|
22
|
+
doc = LoaderRuby.load("https://example.com/article")
|
|
30
23
|
|
|
31
|
-
|
|
24
|
+
# PDF with password
|
|
25
|
+
loader = LoaderRuby::Loaders::Pdf.new("encrypted.pdf", password: "secret")
|
|
26
|
+
doc = loader.load
|
|
32
27
|
|
|
33
|
-
|
|
28
|
+
# Access content
|
|
29
|
+
doc.content # => extracted text
|
|
30
|
+
doc.metadata # => { source: "report.pdf", ... }
|
|
34
31
|
```
|
|
35
32
|
|
|
33
|
+
## Features
|
|
34
|
+
|
|
35
|
+
- PDF, HTML, CSV, DOCX, and plain text loaders
|
|
36
|
+
- Web loader with configurable max redirects (default: 5)
|
|
37
|
+
- Encoding auto-detection (BOM, Content-Type charset)
|
|
38
|
+
- Graceful transcoding to UTF-8
|
|
39
|
+
- Shared HTML extraction module
|
|
40
|
+
- Error hierarchy (FileNotFoundError, TooManyRedirectsError, etc.)
|
|
41
|
+
- Input validation for paths and URLs
|
|
42
|
+
|
|
36
43
|
## License
|
|
37
44
|
|
|
38
45
|
MIT
|
|
data/lib/loader_ruby/format_detector.rb
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LoaderRuby
|
|
4
|
+
module FormatDetector
|
|
5
|
+
MAGIC_BYTES = {
|
|
6
|
+
pdf: [0x25, 0x50, 0x44, 0x46], # %PDF
|
|
7
|
+
zip: [0x50, 0x4B, 0x03, 0x04], # PK (XLSX, DOCX, EPUB)
|
|
8
|
+
rtf: [0x7B, 0x5C, 0x72, 0x74, 0x66], # {\rtf
|
|
9
|
+
}.freeze
|
|
10
|
+
|
|
11
|
+
def self.detect(path)
|
|
12
|
+
return nil unless File.exist?(path)
|
|
13
|
+
|
|
14
|
+
bytes = File.binread(path, 8).bytes
|
|
15
|
+
|
|
16
|
+
MAGIC_BYTES.each do |format, signature|
|
|
17
|
+
if bytes[0, signature.length] == signature
|
|
18
|
+
return resolve_zip(path) if format == :zip
|
|
19
|
+
return format
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
# Fallback: try content inspection
|
|
24
|
+
content = File.read(path, 1024, encoding: "UTF-8") rescue nil
|
|
25
|
+
return nil unless content
|
|
26
|
+
|
|
27
|
+
return :json if content.strip.start_with?("{", "[")
|
|
28
|
+
return :email if content.match?(/\AFrom:|Subject:|Content-Type:/i)
|
|
29
|
+
return :html if content.match?(/<html|<!DOCTYPE html/i)
|
|
30
|
+
return :xml if content.strip.start_with?("<?xml", "<")
|
|
31
|
+
|
|
32
|
+
nil
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
def self.resolve_zip(path)
|
|
36
|
+
# Peek inside ZIP to determine specific format
|
|
37
|
+
content = File.binread(path, 2048)
|
|
38
|
+
return :docx if content.include?("word/document.xml")
|
|
39
|
+
return :xlsx if content.include?("xl/workbook.xml")
|
|
40
|
+
return :epub if content.include?("META-INF/container.xml") || content.include?("mimetype")
|
|
41
|
+
:zip
|
|
42
|
+
end
|
|
43
|
+
end
|
|
44
|
+
end
|
|
data/lib/loader_ruby/loaders/email.rb
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LoaderRuby
|
|
4
|
+
module Loaders
|
|
5
|
+
class Email < Base
|
|
6
|
+
def load(path, **opts)
|
|
7
|
+
check_file_exists!(path)
|
|
8
|
+
check_file_size!(path)
|
|
9
|
+
|
|
10
|
+
raw = File.read(path)
|
|
11
|
+
headers, body = parse_email(raw)
|
|
12
|
+
|
|
13
|
+
Document.new(
|
|
14
|
+
content: body,
|
|
15
|
+
metadata: build_metadata(path, format: :email,
|
|
16
|
+
subject: headers["subject"],
|
|
17
|
+
from: headers["from"],
|
|
18
|
+
to: headers["to"],
|
|
19
|
+
date: headers["date"])
|
|
20
|
+
)
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
private
|
|
24
|
+
|
|
25
|
+
def parse_email(raw)
|
|
26
|
+
# Split headers and body at first blank line
|
|
27
|
+
parts = raw.split(/\r?\n\r?\n/, 2)
|
|
28
|
+
header_text = parts[0] || ""
|
|
29
|
+
body = parts[1] || ""
|
|
30
|
+
|
|
31
|
+
headers = {}
|
|
32
|
+
header_text.split(/\r?\n/).each do |line|
|
|
33
|
+
if line.match?(/\A\S+:/)
|
|
34
|
+
key, value = line.split(":", 2)
|
|
35
|
+
headers[key.strip.downcase] = value&.strip
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
# Strip HTML from body if it looks like HTML
|
|
40
|
+
if body.include?("<html") || body.include?("<body")
|
|
41
|
+
body = body.gsub(/<[^>]+>/, " ").gsub(/\s+/, " ").strip
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
[headers, body.strip]
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
end
|
|
data/lib/loader_ruby/loaders/epub.rb
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "zip" if defined?(Zip) || begin; require "zip"; rescue LoadError; false; end
|
|
4
|
+
|
|
5
|
+
module LoaderRuby
|
|
6
|
+
module Loaders
|
|
7
|
+
class Epub < Base
|
|
8
|
+
def load(path, **opts)
|
|
9
|
+
check_file_exists!(path)
|
|
10
|
+
check_file_size!(path)
|
|
11
|
+
|
|
12
|
+
begin
|
|
13
|
+
require "zip"
|
|
14
|
+
rescue LoadError
|
|
15
|
+
raise DependencyMissingError, "rubyzip gem is required for EPUB loading"
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
content = extract_text(path)
|
|
19
|
+
|
|
20
|
+
Document.new(
|
|
21
|
+
content: content,
|
|
22
|
+
metadata: build_metadata(path, format: :epub)
|
|
23
|
+
)
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
private
|
|
27
|
+
|
|
28
|
+
def extract_text(path)
|
|
29
|
+
texts = []
|
|
30
|
+
Zip::File.open(path) do |zip|
|
|
31
|
+
zip.each do |entry|
|
|
32
|
+
next unless entry.name.end_with?(".xhtml", ".html", ".htm")
|
|
33
|
+
html = entry.get_input_stream.read
|
|
34
|
+
texts << strip_html(html)
|
|
35
|
+
end
|
|
36
|
+
end
|
|
37
|
+
texts.join("\n\n")
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
def strip_html(html)
|
|
41
|
+
html.gsub(/<script[^>]*>.*?<\/script>/m, "")
|
|
42
|
+
.gsub(/<style[^>]*>.*?<\/style>/m, "")
|
|
43
|
+
.gsub(/<[^>]+>/, " ")
|
|
44
|
+
.gsub(/\s+/, " ")
|
|
45
|
+
.strip
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
end
|
|
data/lib/loader_ruby/loaders/json_loader.rb
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module LoaderRuby
|
|
6
|
+
module Loaders
|
|
7
|
+
class Json < Base
|
|
8
|
+
def load(path, text_key: nil, **opts)
|
|
9
|
+
check_file_exists!(path)
|
|
10
|
+
check_file_size!(path)
|
|
11
|
+
|
|
12
|
+
raw = File.read(path)
|
|
13
|
+
data = ::JSON.parse(raw)
|
|
14
|
+
|
|
15
|
+
content = if text_key
|
|
16
|
+
extract_by_key(data, text_key)
|
|
17
|
+
else
|
|
18
|
+
::JSON.pretty_generate(data)
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
Document.new(
|
|
22
|
+
content: content,
|
|
23
|
+
metadata: build_metadata(path, format: :json, keys: data.is_a?(Hash) ? data.keys : nil)
|
|
24
|
+
)
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
private
|
|
28
|
+
|
|
29
|
+
def extract_by_key(data, key)
|
|
30
|
+
if data.is_a?(Array)
|
|
31
|
+
data.map { |item| item.is_a?(Hash) ? item[key].to_s : item.to_s }.join("\n")
|
|
32
|
+
elsif data.is_a?(Hash)
|
|
33
|
+
data[key].to_s
|
|
34
|
+
else
|
|
35
|
+
data.to_s
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
end
|
|
data/lib/loader_ruby/loaders/rtf.rb
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LoaderRuby
|
|
4
|
+
module Loaders
|
|
5
|
+
class Rtf < Base
|
|
6
|
+
def load(path, **opts)
|
|
7
|
+
check_file_exists!(path)
|
|
8
|
+
check_file_size!(path)
|
|
9
|
+
|
|
10
|
+
raw = File.read(path)
|
|
11
|
+
content = strip_rtf(raw)
|
|
12
|
+
|
|
13
|
+
Document.new(
|
|
14
|
+
content: content,
|
|
15
|
+
metadata: build_metadata(path, format: :rtf)
|
|
16
|
+
)
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
private
|
|
20
|
+
|
|
21
|
+
def strip_rtf(text)
|
|
22
|
+
# Remove RTF control words, keep plain text
|
|
23
|
+
text = text.gsub(/\\[a-z]+\d*[ ]?/i, "") # Remove control words
|
|
24
|
+
text = text.gsub(/[{}]/, "") # Remove braces
|
|
25
|
+
text = text.gsub(/\s+/, " ").strip
|
|
26
|
+
text
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
data/lib/loader_ruby/loaders/xlsx.rb
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LoaderRuby
|
|
4
|
+
module Loaders
|
|
5
|
+
class Xlsx < Base
|
|
6
|
+
def load(path, sheet: nil, row_as_document: false, **opts)
|
|
7
|
+
check_file_exists!(path)
|
|
8
|
+
check_file_size!(path)
|
|
9
|
+
|
|
10
|
+
begin
|
|
11
|
+
require "roo"
|
|
12
|
+
rescue LoadError
|
|
13
|
+
raise DependencyMissingError, "roo gem is required for XLSX loading"
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
workbook = Roo::Spreadsheet.open(path)
|
|
17
|
+
worksheet = sheet ? workbook.sheet(sheet) : workbook.sheet(0)
|
|
18
|
+
|
|
19
|
+
if row_as_document
|
|
20
|
+
load_rows(path, worksheet)
|
|
21
|
+
else
|
|
22
|
+
load_all(path, worksheet)
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
private
|
|
27
|
+
|
|
28
|
+
def load_all(path, worksheet)
|
|
29
|
+
rows = []
|
|
30
|
+
worksheet.each_row_streaming do |row|
|
|
31
|
+
rows << row.map { |cell| cell&.value.to_s }.join("\t")
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
Document.new(
|
|
35
|
+
content: rows.join("\n"),
|
|
36
|
+
metadata: build_metadata(path, format: :xlsx, rows: rows.size)
|
|
37
|
+
)
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
def load_rows(path, worksheet)
|
|
41
|
+
headers = nil
|
|
42
|
+
documents = []
|
|
43
|
+
|
|
44
|
+
worksheet.each_row_streaming.each_with_index do |row, i|
|
|
45
|
+
values = row.map { |cell| cell&.value.to_s }
|
|
46
|
+
if i == 0
|
|
47
|
+
headers = values
|
|
48
|
+
next
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
content = headers ? headers.zip(values).map { |k, v| "#{k}: #{v}" }.join("\n") : values.join("\t")
|
|
52
|
+
documents << Document.new(
|
|
53
|
+
content: content,
|
|
54
|
+
metadata: build_metadata(path, format: :xlsx, row_index: i, headers: headers)
|
|
55
|
+
)
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
documents
|
|
59
|
+
end
|
|
60
|
+
end
|
|
61
|
+
end
|
|
62
|
+
end
|
|
data/lib/loader_ruby/loaders/xml.rb
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LoaderRuby
|
|
4
|
+
module Loaders
|
|
5
|
+
class Xml < Base
|
|
6
|
+
def load(path, **opts)
|
|
7
|
+
check_file_exists!(path)
|
|
8
|
+
check_file_size!(path)
|
|
9
|
+
|
|
10
|
+
begin
|
|
11
|
+
require "nokogiri"
|
|
12
|
+
rescue LoadError
|
|
13
|
+
raise DependencyMissingError, "nokogiri gem is required for XML loading"
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
raw = File.read(path)
|
|
17
|
+
doc = Nokogiri::XML(raw)
|
|
18
|
+
|
|
19
|
+
content = doc.text.gsub(/\s+/, " ").strip
|
|
20
|
+
|
|
21
|
+
Document.new(
|
|
22
|
+
content: content,
|
|
23
|
+
metadata: build_metadata(path, format: :xml, root: doc.root&.name)
|
|
24
|
+
)
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
data/lib/loader_ruby/parallel_loader.rb
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LoaderRuby
|
|
4
|
+
class ParallelLoader
|
|
5
|
+
def initialize(threads: 4)
|
|
6
|
+
@threads = threads
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
def load(sources, **opts)
|
|
10
|
+
return sources.map { |s| LoaderRuby.load(s, **opts) } if @threads <= 1
|
|
11
|
+
|
|
12
|
+
results = Array.new(sources.size)
|
|
13
|
+
errors = []
|
|
14
|
+
mutex = Mutex.new
|
|
15
|
+
|
|
16
|
+
work_queue = Queue.new
|
|
17
|
+
sources.each_with_index { |s, i| work_queue << [s, i] }
|
|
18
|
+
@threads.times { work_queue << nil } # Poison pills
|
|
19
|
+
|
|
20
|
+
threads = @threads.times.map do
|
|
21
|
+
Thread.new do
|
|
22
|
+
while (item = work_queue.pop)
|
|
23
|
+
source, index = item
|
|
24
|
+
begin
|
|
25
|
+
doc = LoaderRuby.load(source, **opts)
|
|
26
|
+
mutex.synchronize { results[index] = doc }
|
|
27
|
+
rescue => e
|
|
28
|
+
mutex.synchronize { errors << { source: source, error: e } }
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
threads.each(&:join)
|
|
35
|
+
|
|
36
|
+
raise Error, "#{errors.size} files failed to load" if errors.any? && results.compact.empty?
|
|
37
|
+
|
|
38
|
+
results
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
end
|
|
data/lib/loader_ruby/streaming_loader.rb
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LoaderRuby
|
|
4
|
+
class StreamingLoader
|
|
5
|
+
DEFAULT_CHUNK_SIZE = 64 * 1024 # 64KB
|
|
6
|
+
|
|
7
|
+
def initialize(chunk_size: DEFAULT_CHUNK_SIZE)
|
|
8
|
+
@chunk_size = chunk_size
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def load(path, &block)
|
|
12
|
+
raise ArgumentError, "Block required for streaming" unless block_given?
|
|
13
|
+
raise FileNotFoundError, "File not found: #{path}" unless File.exist?(path)
|
|
14
|
+
|
|
15
|
+
File.open(path, "rb") do |file|
|
|
16
|
+
while (chunk = file.read(@chunk_size))
|
|
17
|
+
yield chunk
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
def load_lines(path, &block)
|
|
23
|
+
raise ArgumentError, "Block required for streaming" unless block_given?
|
|
24
|
+
raise FileNotFoundError, "File not found: #{path}" unless File.exist?(path)
|
|
25
|
+
|
|
26
|
+
File.foreach(path) do |line|
|
|
27
|
+
yield line
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|
data/lib/loader_ruby/version.rb
CHANGED
data/lib/loader_ruby.rb
CHANGED
|
@@ -13,6 +13,15 @@ require_relative "loader_ruby/loaders/docx"
|
|
|
13
13
|
require_relative "loader_ruby/loaders/csv"
|
|
14
14
|
require_relative "loader_ruby/loaders/html"
|
|
15
15
|
require_relative "loader_ruby/loaders/web"
|
|
16
|
+
require_relative "loader_ruby/loaders/json_loader"
|
|
17
|
+
require_relative "loader_ruby/loaders/xml"
|
|
18
|
+
require_relative "loader_ruby/loaders/epub"
|
|
19
|
+
require_relative "loader_ruby/loaders/rtf"
|
|
20
|
+
require_relative "loader_ruby/loaders/email"
|
|
21
|
+
require_relative "loader_ruby/loaders/xlsx"
|
|
22
|
+
require_relative "loader_ruby/format_detector"
|
|
23
|
+
require_relative "loader_ruby/parallel_loader"
|
|
24
|
+
require_relative "loader_ruby/streaming_loader"
|
|
16
25
|
|
|
17
26
|
module LoaderRuby
|
|
18
27
|
FORMAT_MAP = {
|
|
@@ -21,7 +30,13 @@ module LoaderRuby
|
|
|
21
30
|
".pdf" => Loaders::Pdf,
|
|
22
31
|
".docx" => Loaders::Docx,
|
|
23
32
|
".csv" => Loaders::Csv, ".tsv" => Loaders::Csv,
|
|
24
|
-
".html" => Loaders::Html, ".htm" => Loaders::Html
|
|
33
|
+
".html" => Loaders::Html, ".htm" => Loaders::Html,
|
|
34
|
+
".json" => Loaders::Json,
|
|
35
|
+
".xml" => Loaders::Xml,
|
|
36
|
+
".epub" => Loaders::Epub,
|
|
37
|
+
".rtf" => Loaders::Rtf,
|
|
38
|
+
".eml" => Loaders::Email,
|
|
39
|
+
".xlsx" => Loaders::Xlsx, ".xls" => Loaders::Xlsx
|
|
25
40
|
}.freeze
|
|
26
41
|
|
|
27
42
|
class << self
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: loader-ruby
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.1
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Johannes Dwi Cahyo
|
|
@@ -82,14 +82,23 @@ files:
|
|
|
82
82
|
- lib/loader_ruby/document.rb
|
|
83
83
|
- lib/loader_ruby/encoding_detector.rb
|
|
84
84
|
- lib/loader_ruby/error.rb
|
|
85
|
+
- lib/loader_ruby/format_detector.rb
|
|
85
86
|
- lib/loader_ruby/html_extractor.rb
|
|
86
87
|
- lib/loader_ruby/loaders/base.rb
|
|
87
88
|
- lib/loader_ruby/loaders/csv.rb
|
|
88
89
|
- lib/loader_ruby/loaders/docx.rb
|
|
90
|
+
- lib/loader_ruby/loaders/email.rb
|
|
91
|
+
- lib/loader_ruby/loaders/epub.rb
|
|
89
92
|
- lib/loader_ruby/loaders/html.rb
|
|
93
|
+
- lib/loader_ruby/loaders/json_loader.rb
|
|
90
94
|
- lib/loader_ruby/loaders/pdf.rb
|
|
95
|
+
- lib/loader_ruby/loaders/rtf.rb
|
|
91
96
|
- lib/loader_ruby/loaders/text.rb
|
|
92
97
|
- lib/loader_ruby/loaders/web.rb
|
|
98
|
+
- lib/loader_ruby/loaders/xlsx.rb
|
|
99
|
+
- lib/loader_ruby/loaders/xml.rb
|
|
100
|
+
- lib/loader_ruby/parallel_loader.rb
|
|
101
|
+
- lib/loader_ruby/streaming_loader.rb
|
|
93
102
|
- lib/loader_ruby/version.rb
|
|
94
103
|
- loader-ruby.gemspec
|
|
95
104
|
homepage: https://github.com/johannesdwicahyo/loader-ruby
|