markdownator 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +20 -0
- data/CHANGELOG.md +9 -0
- data/CODE_OF_CONDUCT.md +84 -0
- data/Gemfile +22 -0
- data/LICENSE.txt +21 -0
- data/README.md +109 -0
- data/Rakefile +12 -0
- data/lib/markdownator/converters/base.rb +69 -0
- data/lib/markdownator/converters/csv.rb +21 -0
- data/lib/markdownator/converters/docx.rb +80 -0
- data/lib/markdownator/converters/epub.rb +64 -0
- data/lib/markdownator/converters/html.rb +29 -0
- data/lib/markdownator/converters/image.rb +88 -0
- data/lib/markdownator/converters/json.rb +22 -0
- data/lib/markdownator/converters/pdf.rb +37 -0
- data/lib/markdownator/converters/plain_text.rb +22 -0
- data/lib/markdownator/converters/pptx.rb +47 -0
- data/lib/markdownator/converters/xlsx.rb +51 -0
- data/lib/markdownator/converters/xml.rb +36 -0
- data/lib/markdownator/converters/zip.rb +58 -0
- data/lib/markdownator/engine.rb +110 -0
- data/lib/markdownator/errors.rb +15 -0
- data/lib/markdownator/result.rb +23 -0
- data/lib/markdownator/stream_info.rb +73 -0
- data/lib/markdownator/version.rb +5 -0
- data/lib/markdownator.rb +56 -0
- data/markdownator.gemspec +39 -0
- data/sig/markdownator.rbs +50 -0
- metadata +79 -0
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Markdownator
|
|
4
|
+
module Converters
|
|
5
|
+
# Extracts text from a PDF (one block per page) using the `pdf-reader` gem.
class Pdf < Base
  # Accepts the stream when the extension/mimetype hints say PDF, or when the
  # stream starts with the "%PDF-" signature.
  # NOTE(review): `matches?` is not defined in this class; presumably it is
  # inherited from Base.
  def accepts?(io, stream_info)
    return true if matches?(stream_info, extensions: %w[pdf], mimetypes: %w[application/pdf])

    magic_pdf?(io)
  end

  # Reads every page, drops pages whose stripped text is empty, and joins the
  # remaining pages with a horizontal rule.
  #
  # @return [Result] Markdown plus `page_count` metadata.
  # @raise [MissingDependencyError] when the `pdf-reader` gem is not installed.
  # @raise [FileConversionError] when pdf-reader reports a malformed or
  #   unsupported PDF.
  def convert(io, _stream_info, **_options)
    # Load the gem *outside* the begin/rescue below. Ruby evaluates the class
    # list of a rescue clause whenever an exception reaches it, so if the gem
    # is missing, `PDF::Reader::...` would itself raise NameError and hide the
    # MissingDependencyError raised here.
    Markdownator.require_optional("pdf-reader", feature: "PDF conversion")

    begin
      reader = PDF::Reader.new(io)
      pages = reader.pages.map { |page| page.text.strip }
      pages.reject!(&:empty?)
      Result.new(
        markdown: pages.join("\n\n---\n\n"),
        metadata: { page_count: reader.page_count }
      )
    rescue PDF::Reader::MalformedPDFError, PDF::Reader::UnsupportedFeatureError => e
      raise FileConversionError, "Could not read PDF: #{e.message}"
    end
  end

  private

  # True when the first five bytes are "%PDF-". The stream is rewound before
  # and after the check when it supports `rewind`.
  def magic_pdf?(io)
    io.rewind if io.respond_to?(:rewind)
    io.read(5) == "%PDF-"
  ensure
    io.rewind if io.respond_to?(:rewind)
  end
end
|
|
36
|
+
end
|
|
37
|
+
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Markdownator
|
|
4
|
+
module Converters
|
|
5
|
+
# Pass-through converter: plain text and Markdown are returned as-is, apart
# from stripping leading/trailing whitespace.
class PlainText < Base
  EXTENSIONS = %w[txt text md markdown].freeze
  MIMETYPES = %w[text/plain text/markdown].freeze

  # Accepts the known text extensions/mimetypes, and otherwise any stream
  # whose guessed mimetype begins with "text/".
  def accepts?(_io, stream_info)
    return true if matches?(stream_info, extensions: EXTENSIONS, mimetypes: MIMETYPES)

    guessed = stream_info.guessed_mimetype
    guessed&.start_with?("text/") || false
  end

  # Wraps the stripped stream contents in a Result.
  def convert(io, stream_info, **_options)
    contents = read_all(io, stream_info)
    Result.new(markdown: contents.strip)
  end
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Markdownator
|
|
4
|
+
module Converters
|
|
5
|
+
# Renders a PowerPoint .pptx deck as Markdown. Every slide becomes a
# `## Slide N` heading followed by the slide's text, one line per paragraph.
class Pptx < Base
  SLIDE_PATTERN = %r{\Appt/slides/slide(\d+)\.xml\z}.freeze

  # Accepts streams whose extension or mimetype identifies a .pptx file.
  def accepts?(_io, stream_info)
    pptx_mimetypes = %w[application/vnd.openxmlformats-officedocument.presentationml.presentation]
    matches?(stream_info, extensions: %w[pptx], mimetypes: pptx_mimetypes)
  end

  # Opens the deck as a ZIP archive, orders the slide parts by the number in
  # their file name and renders each; headings are numbered by position.
  def convert(io, _stream_info, **_options)
    %w[zip nokogiri].each do |library|
      Markdownator.require_optional(library, feature: "PPTX conversion")
    end

    sections = []
    ::Zip::File.open_buffer(io) do |archive|
      slide_entries = archive.entries.select { |entry| SLIDE_PATTERN.match?(entry.name) }
      slide_entries = slide_entries.sort_by { |entry| entry.name[SLIDE_PATTERN, 1].to_i }
      slide_entries.each.with_index(1) do |entry, position|
        sections << render_slide(entry.get_input_stream.read, position)
      end
    end
    Result.new(markdown: sections.join("\n\n"))
  end

  private

  # Builds the Markdown for one slide: the heading, then (if any paragraph has
  # text) a blank line and the non-empty paragraphs.
  def render_slide(xml, number)
    document = Nokogiri::XML(xml)
    document.remove_namespaces!

    paragraphs = document.xpath("//p").map do |paragraph|
      paragraph.xpath(".//t").map(&:text).join.strip
    end
    paragraphs = paragraphs.reject(&:empty?)

    heading = "## Slide #{number}"
    return heading if paragraphs.empty?

    "#{heading}\n\n#{paragraphs.join("\n")}"
  end
end
|
|
46
|
+
end
|
|
47
|
+
end
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "tempfile"
|
|
4
|
+
|
|
5
|
+
module Markdownator
|
|
6
|
+
module Converters
|
|
7
|
+
# Converts an Excel .xlsx workbook into Markdown: one `## SheetName` heading
# and a Markdown table per sheet, using the `roo` gem.
class Xlsx < Base
  # Accepts streams whose extension or mimetype identifies an .xlsx file.
  def accepts?(_io, stream_info)
    matches?(
      stream_info,
      extensions: %w[xlsx],
      mimetypes: %w[application/vnd.openxmlformats-officedocument.spreadsheetml.sheet]
    )
  end

  # Copies the stream to a temporary .xlsx file, opens it with Roo and renders
  # every sheet that produces a non-empty table.
  #
  # @return [Result]
  # @raise [MissingDependencyError] when the `roo` gem is not installed.
  # @raise [FileConversionError] when reading the workbook fails for any
  #   other reason.
  def convert(io, _stream_info, **_options)
    # Keep the dependency check outside the rescue below: a method-level
    # `rescue StandardError` would re-label MissingDependencyError as a
    # FileConversionError and hide the "install the gem" hint from callers.
    Markdownator.require_optional("roo", feature: "XLSX conversion")

    begin
      with_tempfile(io) do |path|
        workbook = Roo::Excelx.new(path)
        sections = workbook.sheets.map { |name| render_sheet(workbook, name) }
        Result.new(markdown: sections.compact.join("\n\n"))
      end
    rescue StandardError => e
      raise FileConversionError, "Could not read XLSX: #{e.message}"
    end
  end

  private

  # Renders one sheet as a heading plus table, or nil when the table is empty.
  # NOTE(review): `markdown_table` is not defined in this class; presumably it
  # is inherited from Base.
  def render_sheet(workbook, name)
    sheet = workbook.sheet(name)
    # `to_i` turns a nil last_row/last_column into 0, giving an empty range.
    rows = (1..sheet.last_row.to_i).map do |r|
      (1..sheet.last_column.to_i).map { |c| sheet.cell(r, c) }
    end
    table = markdown_table(rows)
    table.empty? ? nil : "## #{name}\n\n#{table}"
  end

  # Writes the stream to a temporary file (binary mode) and yields its path.
  # The block form of Tempfile.create removes the file afterwards.
  def with_tempfile(io)
    Tempfile.create(["markdownator", ".xlsx"]) do |file|
      file.binmode
      file.write(io.read)
      file.flush
      yield file.path
    end
  end
end
|
|
50
|
+
end
|
|
51
|
+
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Markdownator
|
|
4
|
+
module Converters
|
|
5
|
+
# Renders an XML document as a nested Markdown bullet outline: one bullet per
# element, labelled with the element name and its own text (if any).
class Xml < Base
  # Accepts streams whose extension or mimetype identifies XML.
  def accepts?(_io, stream_info)
    matches?(stream_info, extensions: %w[xml], mimetypes: %w[application/xml text/xml])
  end

  # Parses the stream with Nokogiri and walks the tree from the root element.
  # Raises FileConversionError when the parsed document has no root.
  def convert(io, stream_info, **_options)
    Markdownator.require_optional("nokogiri", feature: "XML conversion")

    root = Nokogiri::XML(read_all(io, stream_info)).root
    raise FileConversionError, "Could not parse XML" if root.nil?

    outline = []
    walk(root, 0, outline)
    Result.new(markdown: outline.join("\n"))
  end

  private

  # Appends the bullet for `node` to `lines`, then recurses into its element
  # children one level deeper. Whitespace runs in the node's direct text are
  # collapsed to single spaces.
  def walk(node, depth, lines)
    direct_text = node.xpath("./text()").map(&:text).join(" ")
    own_text = direct_text.gsub(/\s+/, " ").strip
    label = own_text.empty? ? node.name : "#{node.name}: #{own_text}"

    lines << "#{" " * depth}- #{label}"

    node.element_children.each { |child| walk(child, depth + 1, lines) }
  end
end
|
|
35
|
+
end
|
|
36
|
+
end
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "stringio"
|
|
4
|
+
|
|
5
|
+
module Markdownator
|
|
6
|
+
module Converters
|
|
7
|
+
# Converts a ZIP archive by feeding each contained file back through the
# engine and joining the per-file results, each under a `## <entry name>`
# heading.
class Zip < Base
  # Accepts ZIP extension/mimetype hints, or a stream that begins with the
  # local-file-header signature "PK\x03\x04".
  def accepts?(io, stream_info)
    return true if matches?(stream_info, extensions: %w[zip], mimetypes: %w[application/zip])

    magic_zip?(io)
  end

  # Converts every non-directory entry in name order. Uses the engine passed
  # in `options[:engine]`, or a fresh Engine when none is given.
  def convert(io, _stream_info, **options)
    Markdownator.require_optional("zip", feature: "ZIP conversion")
    engine = options[:engine] || Engine.new

    sections = []
    ::Zip::File.open_buffer(io) do |archive|
      archive.entries.sort_by(&:name).each do |entry|
        next if entry.directory?

        rendered = convert_entry(engine, entry, options)
        sections << rendered unless rendered.nil?
      end
    end
    Result.new(markdown: sections.join("\n\n"))
  end

  private

  # Converts a single archive entry. Returns nil when the result is blank or
  # when the engine raises UnsupportedFormatError / FileConversionError, so
  # such entries are skipped instead of aborting the whole archive.
  def convert_entry(engine, entry, options)
    entry_name = entry.name
    info = StreamInfo.new(
      extension: File.extname(entry_name),
      filename: File.basename(entry_name)
    )
    payload = StringIO.new(entry.get_input_stream.read)
    forwarded = options.reject { |key, _| key == :engine }

    body = engine.convert_stream(payload, info, **forwarded).markdown.strip
    return nil if body.empty?

    "## #{entry_name}\n\n#{body}"
  rescue UnsupportedFormatError, FileConversionError
    nil
  end

  # True when the first four bytes are the ZIP signature. The stream is
  # rewound before and after the check when it supports `rewind`.
  def magic_zip?(io)
    io.rewind if io.respond_to?(:rewind)
    signature = io.read(4)
    signature == "PK\x03\x04"
  ensure
    io.rewind if io.respond_to?(:rewind)
  end
end
|
|
57
|
+
end
|
|
58
|
+
end
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "stringio"
|
|
4
|
+
require "uri"
|
|
5
|
+
require "net/http"
|
|
6
|
+
|
|
7
|
+
module Markdownator
|
|
8
|
+
# Coordinates a conversion: keeps a prioritized converter chain and hands each
# input (local path, http(s) URL, or readable stream) to the first converter
# whose `accepts?` returns truthy.
class Engine
  # Converter keys in priority order. The ZIP-based container formats
  # (Office documents, EPUB) are listed ahead of the generic ZIP converter,
  # and the plain-text fallback is tried last.
  DEFAULT_CONVERTER_ORDER = %i[
    docx xlsx pptx epub zip pdf image html csv json xml plain_text
  ].freeze

  attr_reader :converters

  # Instantiates one converter per key of DEFAULT_CONVERTER_ORDER.
  def self.default_converters
    DEFAULT_CONVERTER_ORDER.map { |key| Converters.const_get(camelize(key)).new }
  end

  # Turns a snake_case name into CamelCase (`:plain_text` => "PlainText").
  def self.camelize(name)
    words = name.to_s.split("_")
    words.map { |word| word.capitalize }.join
  end

  # @param converters [Array<Converters::Base>] custom converter chain; the
  #   default chain is built when nil.
  # @param options [Hash] default options merged into every conversion
  #   (e.g. `captioner:`).
  def initialize(converters: nil, **options)
    @default_options = options
    @converters = converters || self.class.default_converters
  end

  # Permissive entry point: readable objects are treated as streams (with an
  # optional `stream_info:` option), http(s) strings as URLs, and anything
  # else as a local path.
  def convert(source, **options)
    if source.respond_to?(:read)
      hint = options.delete(:stream_info)
      return convert_stream(source, hint, **options)
    end
    return convert_url(source, **options) if url?(source)

    convert_local(source, **options)
  end

  # Converts a local file path; raises FileConversionError when the path is
  # not a regular file.
  def convert_local(path, **options)
    raise FileConversionError, "No such file: #{path}" unless File.file?(path)

    info = StreamInfo.new(
      extension: File.extname(path),
      filename: File.basename(path),
      local_path: path
    )
    File.open(path, "rb") { |handle| convert_stream(handle, info, **options) }
  end

  # Fetches an http(s) URL and converts the response body; any response that
  # is not a Net::HTTPSuccess raises FileConversionError.
  def convert_url(url, **options)
    uri = URI.parse(url)
    response = Net::HTTP.get_response(uri)
    unless response.is_a?(Net::HTTPSuccess)
      raise FileConversionError, "HTTP #{response.code} fetching #{url}"
    end

    remote_path = uri.path.to_s
    info = StreamInfo.new(
      mimetype: response.content_type,
      extension: File.extname(remote_path),
      charset: response.type_params["charset"],
      filename: File.basename(remote_path),
      url: url
    )
    convert_stream(StringIO.new(response.body), info, **options)
  end

  # Converts an open IO stream; `stream_info` carries the format hints. The
  # engine's default options are merged with the call's options, and the
  # engine itself is always passed along as `:engine`.
  def convert_stream(io, stream_info = nil, **options)
    info = stream_info || StreamInfo.new
    merged = @default_options.merge(options)
    merged[:engine] = self

    chosen = pick_converter(io, info)
    raise UnsupportedFormatError, describe_unsupported(info) if chosen.nil?

    # Probing may have consumed bytes, so start the conversion from the top.
    io.rewind if io.respond_to?(:rewind)
    chosen.convert(io, info, **merged)
  end

  private

  # First converter that accepts the stream, or nil. The stream is rewound
  # before each probe when it supports `rewind`.
  def pick_converter(io, stream_info)
    converters.find do |candidate|
      io.rewind if io.respond_to?(:rewind)
      candidate.accepts?(io, stream_info)
    end
  end

  # True for strings that start with http:// or https:// (case-insensitive).
  def url?(source)
    source.is_a?(String) && %r{\Ahttps?://}i.match?(source)
  end

  # Error text naming the best available identifier for the input.
  def describe_unsupported(stream_info)
    subject = stream_info.filename ||
              stream_info.url ||
              stream_info.guessed_mimetype ||
              "the given stream"
    "No converter accepted #{subject}"
  end
end
|
|
110
|
+
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Markdownator
  # Common ancestor of every Markdownator-specific exception.
  class Error < StandardError
  end

  # No registered converter accepted the input.
  class UnsupportedFormatError < Error
  end

  # A converter needs an optional gem that is not installed.
  class MissingDependencyError < Error
  end

  # A converter accepted the input but could not convert it.
  class FileConversionError < Error
  end
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Markdownator
|
|
4
|
+
# Outcome of a conversion: the generated Markdown, an optional title and a
# metadata hash.
#
# `#to_s` and `#text_content` both return the Markdown, which makes a result
# easy to print or interpolate.
class Result
  attr_reader :markdown, :title, :metadata

  # @param markdown [#to_s] converted text; stored as a String.
  # @param title [String, nil] optional document title.
  # @param metadata [Hash, nil] extra information; nil becomes an empty hash.
  def initialize(markdown:, title: nil, metadata: {})
    @title = title
    @metadata = metadata || {}
    @markdown = markdown.to_s
  end

  # Same value as `#markdown`.
  def text_content
    markdown
  end

  alias to_s markdown
end
|
|
23
|
+
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Markdownator
|
|
4
|
+
# Frozen value object describing a stream of bytes to be converted.
#
# It carries the hints (extension, mimetype, charset, filename, url,
# local_path) that converters use to decide whether they can handle a given
# input.
class StreamInfo
  # Maps a lower-case file extension (without the dot) to a mimetype.
  EXTENSION_TO_MIMETYPE = {
    "txt" => "text/plain",
    "text" => "text/plain",
    "md" => "text/markdown",
    "markdown" => "text/markdown",
    "html" => "text/html",
    "htm" => "text/html",
    "csv" => "text/csv",
    "json" => "application/json",
    "xml" => "application/xml",
    "pdf" => "application/pdf",
    "docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "xlsx" => "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "pptx" => "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "epub" => "application/epub+zip",
    "zip" => "application/zip",
    "jpg" => "image/jpeg",
    "jpeg" => "image/jpeg",
    "png" => "image/png",
    "gif" => "image/gif",
    "tif" => "image/tiff",
    "tiff" => "image/tiff"
  }.freeze

  attr_reader :mimetype, :extension, :charset, :filename, :url, :local_path

  # All hints are optional. The extension is stored lower-cased and without
  # its leading dot; the instance is frozen once built.
  def initialize(mimetype: nil, extension: nil, charset: nil, filename: nil, url: nil, local_path: nil)
    @mimetype = mimetype
    @extension = normalize_extension(extension)
    @charset = charset
    @filename = filename
    @url = url
    @local_path = local_path
    freeze
  end

  # Returns a new StreamInfo with the given attributes overridden, filling in
  # any attribute not provided from the current instance. An explicit nil
  # override clears the attribute.
  def copy_with(**overrides)
    self.class.new(
      mimetype: overrides.fetch(:mimetype, mimetype),
      extension: overrides.fetch(:extension, extension),
      charset: overrides.fetch(:charset, charset),
      filename: overrides.fetch(:filename, filename),
      url: overrides.fetch(:url, url),
      local_path: overrides.fetch(:local_path, local_path)
    )
  end

  # Best-effort mimetype: the explicit one, otherwise derived from extension.
  # Returns nil when neither is known.
  def guessed_mimetype
    mimetype || EXTENSION_TO_MIMETYPE[extension]
  end

  private

  # Lower-cases the extension and drops one leading dot. Returns nil when
  # nothing is left, which covers nil, "" and a bare "." (what
  # `File.extname("name.")` returns on non-Windows Rubies), so callers never
  # see an empty-string extension.
  def normalize_extension(ext)
    normalized = ext.to_s.downcase.delete_prefix(".")
    normalized.empty? ? nil : normalized
  end
end
|
|
73
|
+
end
|
data/lib/markdownator.rb
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "markdownator/version"
|
|
4
|
+
require_relative "markdownator/errors"
|
|
5
|
+
require_relative "markdownator/stream_info"
|
|
6
|
+
require_relative "markdownator/result"
|
|
7
|
+
require_relative "markdownator/converters/base"
|
|
8
|
+
require_relative "markdownator/converters/plain_text"
|
|
9
|
+
require_relative "markdownator/converters/html"
|
|
10
|
+
require_relative "markdownator/converters/csv"
|
|
11
|
+
require_relative "markdownator/converters/json"
|
|
12
|
+
require_relative "markdownator/converters/xml"
|
|
13
|
+
require_relative "markdownator/converters/docx"
|
|
14
|
+
require_relative "markdownator/converters/xlsx"
|
|
15
|
+
require_relative "markdownator/converters/pptx"
|
|
16
|
+
require_relative "markdownator/converters/pdf"
|
|
17
|
+
require_relative "markdownator/converters/epub"
|
|
18
|
+
require_relative "markdownator/converters/zip"
|
|
19
|
+
require_relative "markdownator/converters/image"
|
|
20
|
+
require_relative "markdownator/engine"
|
|
21
|
+
|
|
22
|
+
# Convert assorted file formats (Office docs, PDF, HTML, structured data,
# archives, images) into LLM-friendly Markdown.
module Markdownator
  # Libraries whose `require` name differs from the gem that ships them.
  # `require "zip"` is provided by the `rubyzip` gem, so the install hint must
  # name that gem rather than the require path.
  GEM_NAMES = { "zip" => "rubyzip" }.freeze

  class << self
    # Convert a local path, http(s) URL, or open IO stream to Markdown.
    # @return [Markdownator::Result]
    def convert(source, **options)
      default_engine.convert(source, **options)
    end

    # Convert a local file path to Markdown.
    def convert_local(path, **options)
      default_engine.convert_local(path, **options)
    end

    # Convert an open IO stream to Markdown. `stream_info` supplies format hints.
    def convert_stream(io, stream_info = nil, **options)
      default_engine.convert_stream(io, stream_info, **options)
    end

    # Lazily require an optional library, raising a helpful error if it is
    # missing.
    #
    # @param gem_name [String] the name passed to `require`.
    # @param feature [String] human-readable feature name used in the message.
    # @raise [MissingDependencyError] when the library cannot be loaded; the
    #   message names the gem to install (see GEM_NAMES).
    def require_optional(gem_name, feature:)
      require gem_name
    rescue LoadError
      install_name = GEM_NAMES.fetch(gem_name, gem_name)
      raise MissingDependencyError,
            "#{feature} requires the '#{install_name}' gem. Add it to your Gemfile: gem \"#{install_name}\""
    end

    private

    # Shared engine used by the module-level helpers, built on first use.
    def default_engine
      @default_engine ||= Engine.new
    end
  end
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "lib/markdownator/version"
|
|
4
|
+
|
|
5
|
+
Gem::Specification.new do |spec|
  spec.name = "markdownator"
  spec.version = Markdownator::VERSION
  spec.authors = ["alexrupom"]
  spec.email = ["alexrupom@hotmail.com"]

  spec.summary = "Convert files (Office docs, PDF, HTML, data, archives, images) to LLM-friendly Markdown."
  spec.description = "Markdownator converts PDF, Word, Excel, PowerPoint, EPUB, HTML, CSV, JSON, XML, " \
                     "ZIP archives and images into clean Markdown suitable for large language models, " \
                     "using a pluggable converter architecture."
  spec.homepage = "https://github.com/alexrupom/markdownator"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 2.7.0"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = spec.homepage
  spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/main/CHANGELOG.md"
  spec.metadata["rubygems_mfa_required"] = "true"

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  #
  # `git ls-files` prints paths relative to the repository root, while
  # `__FILE__` is an absolute path when the gemspec is loaded by absolute path.
  # Compare base names so the gemspec excludes itself either way.
  gemspec_name = File.basename(__FILE__)
  spec.files = Dir.chdir(__dir__) do
    `git ls-files -z`.split("\x0").reject do |f|
      (f == gemspec_name) || f.match?(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
    end
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Heavy format gems are intentionally NOT runtime dependencies. Each converter
  # requires its gem lazily and raises a helpful error if it is missing, so apps
  # install only what they need. The gems used to exercise every format in the
  # test suite are declared as development dependencies in the Gemfile.
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# Type signatures for the public Markdownator API.
module Markdownator
  VERSION: String

  def self.convert: (untyped source, **untyped options) -> Result
  def self.convert_local: (String path, **untyped options) -> Result
  def self.convert_stream: (untyped io, ?StreamInfo? stream_info, **untyped options) -> Result
  def self.require_optional: (String gem_name, feature: String) -> void

  # Base error for all Markdownator failures.
  class Error < StandardError
  end

  # Raised when no registered converter accepts the given input.
  class UnsupportedFormatError < Error
  end

  # Raised when a converter needs an optional gem that is not installed.
  class MissingDependencyError < Error
  end

  # Raised when a converter accepts the input but fails to convert it.
  class FileConversionError < Error
  end

  # Format hints for a stream of bytes.
  class StreamInfo
    EXTENSION_TO_MIMETYPE: Hash[String, String]

    attr_reader mimetype: String?
    attr_reader extension: String?
    attr_reader charset: String?
    attr_reader filename: String?
    attr_reader url: String?
    attr_reader local_path: String?

    # Every hint is an optional keyword, mirroring the Ruby constructor.
    def initialize: (?mimetype: String?, ?extension: String?, ?charset: String?, ?filename: String?, ?url: String?, ?local_path: String?) -> void
    def guessed_mimetype: () -> String?
    def copy_with: (**untyped overrides) -> StreamInfo
  end

  # Markdown output plus optional title and metadata.
  class Result
    attr_reader markdown: String
    attr_reader title: String?
    attr_reader metadata: Hash[untyped, untyped]

    # `markdown` is coerced with `to_s`; a nil `metadata` becomes `{}`.
    def initialize: (markdown: untyped, ?title: String?, ?metadata: Hash[untyped, untyped]?) -> void
    def to_s: () -> String
    def text_content: () -> String
  end

  # Dispatches an input to the first converter that accepts it.
  class Engine
    DEFAULT_CONVERTER_ORDER: Array[Symbol]

    attr_reader converters: Array[untyped]

    def self.default_converters: () -> Array[untyped]
    def self.camelize: (_ToS name) -> String

    def initialize: (?converters: Array[untyped]?, **untyped options) -> void
    def convert: (untyped source, **untyped options) -> Result
    def convert_local: (String path, **untyped options) -> Result
    def convert_url: (String url, **untyped options) -> Result
    def convert_stream: (untyped io, ?StreamInfo? stream_info, **untyped options) -> Result
  end
end
|