markdownator 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Markdownator
4
+ module Converters
5
# Extracts text from a PDF (one block per page) using the `pdf-reader` gem.
class Pdf < Base
  # Accepts the stream when the hints identify a PDF or, failing that, when
  # the content starts with the `%PDF-` signature.
  #
  # `matches?` is not defined in this file (presumably inherited from Base).
  def accepts?(io, stream_info)
    return true if matches?(stream_info, extensions: %w[pdf], mimetypes: %w[application/pdf])

    magic_pdf?(io)
  end

  # Loads `pdf-reader` on demand, then renders the document.
  #
  # The gem is required here rather than inside the method that rescues
  # `PDF::Reader` errors: Ruby resolves the constants named in a `rescue`
  # clause only while an exception is propagating, so with the gem missing
  # the MissingDependencyError would be replaced by a NameError on `PDF`.
  #
  # @return [Result] page texts joined by horizontal rules, with
  #   `metadata[:page_count]` set
  # @raise [MissingDependencyError] when `pdf-reader` cannot be required
  # @raise [FileConversionError] when the PDF is malformed or unsupported
  def convert(io, _stream_info, **_options)
    Markdownator.require_optional("pdf-reader", feature: "PDF conversion")
    extract(io)
  end

  private

  # Reads every page, drops the ones without text and builds the Result.
  # Only called once `pdf-reader` is loaded, so the rescue constants resolve.
  def extract(io)
    reader = PDF::Reader.new(io)
    pages = reader.pages.map { |page| page.text.strip }
    pages.reject!(&:empty?)
    Result.new(
      markdown: pages.join("\n\n---\n\n"),
      metadata: { page_count: reader.page_count }
    )
  rescue PDF::Reader::MalformedPDFError, PDF::Reader::UnsupportedFeatureError => e
    raise FileConversionError, "Could not read PDF: #{e.message}"
  end

  # Sniffs the first five bytes for the PDF signature, rewinding before and
  # after when the stream supports it so the caller sees an untouched stream.
  def magic_pdf?(io)
    io.rewind if io.respond_to?(:rewind)
    io.read(5) == "%PDF-"
  ensure
    io.rewind if io.respond_to?(:rewind)
  end
end
36
+ end
37
+ end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Markdownator
4
+ module Converters
5
# Pass-through converter for plain text and Markdown sources.
class PlainText < Base
  EXTENSIONS = %w[txt text md markdown].freeze
  MIMETYPES = %w[text/plain text/markdown].freeze

  # True for the known text extensions/mimetypes, and otherwise for any
  # stream whose guessed mimetype is in the `text/` family.
  def accepts?(_io, stream_info)
    if matches?(stream_info, extensions: EXTENSIONS, mimetypes: MIMETYPES)
      true
    else
      # A missing mimetype becomes "", which never starts with "text/".
      stream_info.guessed_mimetype.to_s.start_with?("text/")
    end
  end

  # Returns the stream's text with surrounding whitespace removed.
  # `read_all` is not defined in this file (presumably inherited from Base).
  def convert(io, stream_info, **_options)
    text = read_all(io, stream_info)
    Result.new(markdown: text.strip)
  end
end
21
+ end
22
+ end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Markdownator
4
+ module Converters
5
# Converts a PowerPoint .pptx deck into Markdown: a `## Slide N` heading per
# slide followed by its text (one line per paragraph).
class Pptx < Base
  SLIDE_PATTERN = %r{\Appt/slides/slide(\d+)\.xml\z}.freeze

  # Accepts streams hinted as .pptx by extension or mimetype.
  def accepts?(_io, stream_info)
    matches?(
      stream_info,
      extensions: %w[pptx],
      mimetypes: %w[application/vnd.openxmlformats-officedocument.presentationml.presentation]
    )
  end

  # Loads the optional gems, then renders every slide in the archive.
  #
  # @return [Result] slide sections separated by blank lines
  # @raise [MissingDependencyError] when `zip` or `nokogiri` cannot be required
  # @raise [FileConversionError] when the archive cannot be read
  def convert(io, _stream_info, **_options)
    Markdownator.require_optional("zip", feature: "PPTX conversion")
    Markdownator.require_optional("nokogiri", feature: "PPTX conversion")

    Result.new(markdown: read_slides(io).join("\n\n"))
  end

  private

  # Renders the slide entries in ascending file-number order. Headings are
  # numbered by position (1, 2, 3, ...), not by the number in the file name.
  #
  # Archive errors are wrapped so callers (including the ZIP converter, which
  # skips entries failing with FileConversionError) see a Markdownator error.
  # The rescue lives in this helper, which only runs after the gems are
  # loaded, because `::Zip::Error` is resolved when an exception propagates.
  def read_slides(io)
    sections = []
    ::Zip::File.open_buffer(io) do |zip|
      slides = zip.entries.select { |e| e.name.match?(SLIDE_PATTERN) }
      slides.sort_by! { |e| e.name[SLIDE_PATTERN, 1].to_i }
      slides.each_with_index do |entry, index|
        sections << render_slide(entry.get_input_stream.read, index + 1)
      end
    end
    sections
  rescue ::Zip::Error => e
    raise FileConversionError, "Could not read PPTX: #{e.message}"
  end

  # Builds one slide section: the heading plus one line per non-empty
  # paragraph. Namespaces are removed so the plain `p`/`t` XPath names match.
  def render_slide(xml, number)
    doc = Nokogiri::XML(xml)
    doc.remove_namespaces!
    lines = doc.xpath("//p").map do |para|
      para.xpath(".//t").map(&:text).join.strip
    end
    lines.reject!(&:empty?)
    body = lines.empty? ? "" : "\n\n#{lines.join("\n")}"
    "## Slide #{number}#{body}"
  end
end
46
+ end
47
+ end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "tempfile"
4
+
5
+ module Markdownator
6
+ module Converters
7
# Converts an Excel .xlsx workbook into Markdown: one `## SheetName` heading
# and a Markdown table per sheet, using the `roo` gem.
class Xlsx < Base
  # Accepts streams hinted as .xlsx by extension or mimetype.
  def accepts?(_io, stream_info)
    matches?(
      stream_info,
      extensions: %w[xlsx],
      mimetypes: %w[application/vnd.openxmlformats-officedocument.spreadsheetml.sheet]
    )
  end

  # Loads `roo` on demand, then renders the workbook.
  #
  # The require happens outside the rescued region so that a missing gem is
  # reported as MissingDependencyError rather than being re-wrapped as a
  # FileConversionError.
  #
  # @return [Result] sheet sections separated by blank lines
  # @raise [MissingDependencyError] when `roo` cannot be required
  # @raise [FileConversionError] when the workbook cannot be read
  def convert(io, _stream_info, **_options)
    Markdownator.require_optional("roo", feature: "XLSX conversion")
    render_workbook(io)
  end

  private

  # Copies the stream to a temp file, opens it with Roo and renders every
  # sheet; sheets that render to nothing are dropped.
  def render_workbook(io)
    with_tempfile(io) do |path|
      workbook = Roo::Excelx.new(path)
      sections = workbook.sheets.map { |name| render_sheet(workbook, name) }
      Result.new(markdown: sections.compact.join("\n\n"))
    end
  rescue StandardError => e
    raise FileConversionError, "Could not read XLSX: #{e.message}"
  end

  # Renders one sheet as a heading plus table, or nil when the table is
  # empty. `to_i` turns a nil last row/column into 0, giving an empty range.
  # `markdown_table` is not defined in this file (presumably from Base).
  def render_sheet(workbook, name)
    sheet = workbook.sheet(name)
    rows = (1..sheet.last_row.to_i).map do |r|
      (1..sheet.last_column.to_i).map { |c| sheet.cell(r, c) }
    end
    table = markdown_table(rows)
    table.empty? ? nil : "## #{name}\n\n#{table}"
  end

  # Writes the stream to a temporary .xlsx file and yields its path, since
  # the workbook is opened from a path here. Flushed before yielding so the
  # reader sees the full content.
  def with_tempfile(io)
    Tempfile.create(["markdownator", ".xlsx"]) do |file|
      file.binmode
      file.write(io.read)
      file.flush
      yield file.path
    end
  end
end
50
+ end
51
+ end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Markdownator
4
+ module Converters
5
# Converts XML into an indented Markdown outline of elements and their text.
class Xml < Base
  # Indentation per nesting level. Two spaces line a child bullet up with the
  # content of its parent `- ` item, which Markdown needs to treat it as a
  # nested list item rather than a sibling.
  INDENT = "  ".freeze

  # Accepts streams hinted as XML by extension or mimetype.
  def accepts?(_io, stream_info)
    matches?(stream_info, extensions: %w[xml], mimetypes: %w[application/xml text/xml])
  end

  # Parses the document and renders one bullet per element.
  #
  # @return [Result] the outline, one element per line
  # @raise [MissingDependencyError] when `nokogiri` cannot be required
  # @raise [FileConversionError] when the document has no root element
  def convert(io, stream_info, **_options)
    Markdownator.require_optional("nokogiri", feature: "XML conversion")
    doc = Nokogiri::XML(read_all(io, stream_info))
    raise FileConversionError, "Could not parse XML" if doc.root.nil?

    lines = []
    walk(doc.root, 0, lines)
    Result.new(markdown: lines.join("\n"))
  end

  private

  # Appends a bullet for `node` (its name, plus its own whitespace-collapsed
  # text when present) and then recurses into its element children.
  def walk(node, depth, lines)
    own_text = node.xpath("./text()").map(&:text).join(" ").gsub(/\s+/, " ").strip
    label = own_text.empty? ? node.name : "#{node.name}: #{own_text}"
    lines << "#{INDENT * depth}- #{label}"

    node.element_children.each { |child| walk(child, depth + 1, lines) }
  end
end
35
+ end
36
+ end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "stringio"
4
+
5
+ module Markdownator
6
+ module Converters
7
# Converts a ZIP archive by recursing each contained file back through the
# engine and concatenating the results under per-file headings.
class Zip < Base
  # Accepts the stream when the hints identify a ZIP or, failing that, when
  # the content starts with the local-file-header signature.
  def accepts?(io, stream_info)
    return true if matches?(stream_info, extensions: %w[zip], mimetypes: %w[application/zip])

    magic_zip?(io)
  end

  # Loads the zip library on demand, then converts every file entry.
  #
  # @param options [Hash] conversion options; `:engine` selects the engine
  #   used for the entries (a new Engine is created when absent)
  # @return [Result] per-file sections separated by blank lines
  # @raise [MissingDependencyError] when `zip` cannot be required
  # @raise [FileConversionError] when the archive cannot be read
  def convert(io, _stream_info, **options)
    Markdownator.require_optional("zip", feature: "ZIP conversion")
    engine = options[:engine] || Engine.new

    Result.new(markdown: convert_entries(io, engine, options).join("\n\n"))
  end

  private

  # Converts the file entries in name order, skipping directories and
  # entries that produced no section.
  #
  # Archive errors are wrapped so a corrupt archive surfaces as a Markdownator
  # error. The rescue lives in this helper, which only runs after the library
  # is loaded, because `::Zip::Error` is resolved when an exception propagates.
  def convert_entries(io, engine, options)
    sections = []
    ::Zip::File.open_buffer(io) do |zip|
      zip.entries.sort_by(&:name).each do |entry|
        next if entry.directory?

        section = convert_entry(engine, entry, options)
        sections << section unless section.nil?
      end
    end
    sections
  rescue ::Zip::Error => e
    raise FileConversionError, "Could not read ZIP: #{e.message}"
  end

  # Converts one entry through the engine and returns its section, or nil
  # when the entry is unsupported, fails to convert, or yields no text.
  # `:engine` is removed from the forwarded options.
  def convert_entry(engine, entry, options)
    stream_info = StreamInfo.new(
      extension: File.extname(entry.name),
      filename: File.basename(entry.name)
    )
    result = engine.convert_stream(
      StringIO.new(entry.get_input_stream.read),
      stream_info,
      **options.reject { |k, _| k == :engine }
    )
    body = result.markdown.strip
    body.empty? ? nil : "## #{entry.name}\n\n#{body}"
  rescue UnsupportedFormatError, FileConversionError
    nil
  end

  # Sniffs the first four bytes for the ZIP signature, rewinding before and
  # after when the stream supports it.
  def magic_zip?(io)
    io.rewind if io.respond_to?(:rewind)
    io.read(4) == "PK\x03\x04"
  ensure
    io.rewind if io.respond_to?(:rewind)
  end
end
57
+ end
58
+ end
@@ -0,0 +1,110 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "stringio"
4
+ require "uri"
5
+ require "net/http"
6
+
7
+ module Markdownator
8
# Orchestrator that holds an ordered list of converters and dispatches an
# input (local path, URL, or IO stream) to the first converter that accepts it.
class Engine
  # Default converters, in priority order. More specific formats (the
  # ZIP-based Office/EPUB containers) must come before the generic ZIP
  # converter, and the plain-text fallback comes last.
  DEFAULT_CONVERTER_ORDER = %i[
    docx xlsx pptx epub zip pdf image html csv json xml plain_text
  ].freeze

  attr_reader :converters

  # @param converters [Array<Converters::Base>] custom converter chain.
  # @param options [Hash] default options threaded into every conversion
  #   (e.g. `captioner:`).
  def initialize(converters: nil, **options)
    @converters = converters || self.class.default_converters
    @default_options = options
  end

  # Instantiates one converter per entry of DEFAULT_CONVERTER_ORDER.
  def self.default_converters
    DEFAULT_CONVERTER_ORDER.map do |name|
      Converters.const_get(camelize(name)).new
    end
  end

  # Turns a snake_case name into CamelCase (`:plain_text` -> "PlainText").
  def self.camelize(name)
    name.to_s.split("_").map(&:capitalize).join
  end

  # Permissive entry point: dispatches based on what `source` looks like.
  #
  # Pathname is checked first: it responds to `read`, so it would otherwise
  # be treated as a stream and lose its filename/extension hints.
  #
  # @param source [String, Pathname, #read] local path, http(s) URL or stream
  # @param options [Hash] conversion options; `:stream_info` is used as the
  #   hints when `source` is a stream
  def convert(source, **options)
    if pathname?(source)
      convert_local(source.to_s, **options)
    elsif source.respond_to?(:read)
      convert_stream(source, options.delete(:stream_info), **options)
    elsif url?(source)
      convert_url(source, **options)
    else
      convert_local(source, **options)
    end
  end

  # Converts a local file path.
  #
  # @raise [FileConversionError] when `path` is not an existing regular file
  def convert_local(path, **options)
    raise FileConversionError, "No such file: #{path}" unless File.file?(path)

    stream_info = StreamInfo.new(
      extension: File.extname(path),
      filename: File.basename(path),
      local_path: path
    )
    File.open(path, "rb") do |io|
      convert_stream(io, stream_info, **options)
    end
  end

  # Fetches an http(s) URL and converts the response body.
  #
  # @raise [FileConversionError] when the response is not a success status
  def convert_url(url, **options)
    uri = URI.parse(url)
    response = Net::HTTP.get_response(uri)
    raise FileConversionError, "HTTP #{response.code} fetching #{url}" unless response.is_a?(Net::HTTPSuccess)

    stream_info = StreamInfo.new(
      mimetype: response.content_type,
      extension: File.extname(uri.path.to_s),
      charset: response.type_params["charset"],
      filename: File.basename(uri.path.to_s),
      url: url
    )
    convert_stream(StringIO.new(response.body), stream_info, **options)
  end

  # Converts an open IO stream. `stream_info` provides format hints.
  #
  # Per-call options override the engine defaults, and `:engine` is always
  # set to this engine so converters can recurse through it.
  #
  # @raise [UnsupportedFormatError] when no converter accepts the stream
  def convert_stream(io, stream_info = nil, **options)
    stream_info ||= StreamInfo.new
    opts = @default_options.merge(options)
    opts[:engine] = self

    converter = pick_converter(io, stream_info)
    raise UnsupportedFormatError, describe_unsupported(stream_info) if converter.nil?

    io.rewind if io.respond_to?(:rewind)
    converter.convert(io, stream_info, **opts)
  end

  private

  # First converter accepting the stream; the stream is rewound (when
  # possible) before each converter is asked.
  def pick_converter(io, stream_info)
    converters.find do |converter|
      io.rewind if io.respond_to?(:rewind)
      converter.accepts?(io, stream_info)
    end
  end

  # True for a Pathname. `defined?` avoids depending on the pathname library:
  # if it is not loaded, no Pathname instance can have been passed in.
  def pathname?(source)
    defined?(::Pathname) && source.is_a?(::Pathname)
  end

  # True for strings starting with http:// or https:// (case-insensitive).
  def url?(source)
    source.is_a?(String) && source.match?(%r{\Ahttps?://}i)
  end

  # Error message naming the most specific hint available for the stream.
  def describe_unsupported(stream_info)
    hint = stream_info.filename || stream_info.url || stream_info.guessed_mimetype || "the given stream"
    "No converter accepted #{hint}"
  end
end
110
+ end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
module Markdownator
  # Common ancestor of every Markdownator failure, so callers can rescue the
  # whole family with a single clause.
  class Error < StandardError
  end

  # No registered converter accepted the given input.
  class UnsupportedFormatError < Error
  end

  # A converter needs an optional gem that is not installed.
  class MissingDependencyError < Error
  end

  # A converter accepted the input but could not convert it.
  class FileConversionError < Error
  end
end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Markdownator
4
# Outcome of a conversion: the generated Markdown together with an optional
# title and a metadata hash.
#
# The Markdown is also reachable through `#to_s` and `#text_content`, which
# makes a result easy to print or interpolate.
class Result
  attr_reader :markdown, :title, :metadata

  # @param markdown [#to_s] the converted text (stored as a String)
  # @param title [Object, nil] optional document title
  # @param metadata [Hash, nil] extra data; nil is stored as an empty hash
  def initialize(markdown:, title: nil, metadata: {})
    @markdown, @title = markdown.to_s, title
    @metadata = (metadata || {})
  end

  # String form of the result: the Markdown itself.
  def to_s
    markdown
  end

  alias_method :text_content, :markdown
end
23
+ end
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Markdownator
4
# Immutable value object describing a stream of bytes to be converted.
#
# It carries the hints (extension, mimetype, charset, filename, url,
# local_path) that converters use to decide whether they can handle a given
# input.
class StreamInfo
  # Maps a lower-case file extension (without the dot) to a mimetype.
  EXTENSION_TO_MIMETYPE = {
    "txt" => "text/plain",
    "text" => "text/plain",
    "md" => "text/markdown",
    "markdown" => "text/markdown",
    "html" => "text/html",
    "htm" => "text/html",
    "csv" => "text/csv",
    "json" => "application/json",
    "xml" => "application/xml",
    "pdf" => "application/pdf",
    "docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "xlsx" => "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "pptx" => "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "epub" => "application/epub+zip",
    "zip" => "application/zip",
    "jpg" => "image/jpeg",
    "jpeg" => "image/jpeg",
    "png" => "image/png",
    "gif" => "image/gif",
    "tif" => "image/tiff",
    "tiff" => "image/tiff"
  }.freeze

  attr_reader :mimetype, :extension, :charset, :filename, :url, :local_path

  # Stores the hints and freezes the instance. The extension is normalized
  # to lower case without the leading dot; a blank one is stored as nil.
  def initialize(mimetype: nil, extension: nil, charset: nil, filename: nil, url: nil, local_path: nil)
    @mimetype = mimetype
    @extension = normalize_extension(extension)
    @charset = charset
    @filename = filename
    @url = url
    @local_path = local_path
    freeze
  end

  # Returns a new StreamInfo with the given attributes overridden, filling in
  # any attribute not provided from the current instance. An explicit nil
  # override clears the attribute; unknown keys are ignored.
  def copy_with(**overrides)
    self.class.new(
      mimetype: overrides.fetch(:mimetype, mimetype),
      extension: overrides.fetch(:extension, extension),
      charset: overrides.fetch(:charset, charset),
      filename: overrides.fetch(:filename, filename),
      url: overrides.fetch(:url, url),
      local_path: overrides.fetch(:local_path, local_path)
    )
  end

  # Best-effort mimetype: the explicit one, otherwise derived from extension.
  def guessed_mimetype
    mimetype || EXTENSION_TO_MIMETYPE[extension]
  end

  private

  # Lower-cases the extension and drops its leading dot. Emptiness is checked
  # after normalizing, so a bare "." (what `File.extname("name.")` can
  # return) yields nil like any other missing extension, never "".
  def normalize_extension(ext)
    normalized = ext.to_s.downcase.delete_prefix(".")
    normalized.empty? ? nil : normalized
  end
end
73
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Markdownator
  # Version of the gem.
  VERSION = "0.1.0"
end
@@ -0,0 +1,56 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "markdownator/version"
4
+ require_relative "markdownator/errors"
5
+ require_relative "markdownator/stream_info"
6
+ require_relative "markdownator/result"
7
+ require_relative "markdownator/converters/base"
8
+ require_relative "markdownator/converters/plain_text"
9
+ require_relative "markdownator/converters/html"
10
+ require_relative "markdownator/converters/csv"
11
+ require_relative "markdownator/converters/json"
12
+ require_relative "markdownator/converters/xml"
13
+ require_relative "markdownator/converters/docx"
14
+ require_relative "markdownator/converters/xlsx"
15
+ require_relative "markdownator/converters/pptx"
16
+ require_relative "markdownator/converters/pdf"
17
+ require_relative "markdownator/converters/epub"
18
+ require_relative "markdownator/converters/zip"
19
+ require_relative "markdownator/converters/image"
20
+ require_relative "markdownator/engine"
21
+
22
# Convert assorted file formats (Office docs, PDF, HTML, structured data,
# archives, images) into LLM-friendly Markdown.
module Markdownator
  # Gem to install for a given `require` path, for the cases where the two
  # names differ. The `zip` library used by the converters
  # (`Zip::File.open_buffer`) is shipped by the `rubyzip` gem.
  REQUIRE_TO_GEM = { "zip" => "rubyzip" }.freeze

  class << self
    # Convert a local path, http(s) URL, or open IO stream to Markdown.
    # @return [Markdownator::Result]
    def convert(source, **options)
      default_engine.convert(source, **options)
    end

    # Convert a local file path to Markdown.
    def convert_local(path, **options)
      default_engine.convert_local(path, **options)
    end

    # Convert an open IO stream to Markdown. `stream_info` supplies format hints.
    def convert_stream(io, stream_info = nil, **options)
      default_engine.convert_stream(io, stream_info, **options)
    end

    # Lazily require an optional gem, raising a helpful error if it is missing.
    #
    # @param gem_name [String] the path passed to `require`
    # @param feature [String] human-readable feature name used in the error
    # @raise [MissingDependencyError] when the library cannot be loaded; the
    #   message names the gem to install, which may differ from `gem_name`
    def require_optional(gem_name, feature:)
      require gem_name
    rescue LoadError
      installable = REQUIRE_TO_GEM.fetch(gem_name, gem_name)
      raise MissingDependencyError,
            "#{feature} requires the '#{installable}' gem. Add it to your Gemfile: gem \"#{installable}\""
    end

    private

    # Shared engine used by the module-level helpers, created on first use.
    def default_engine
      @default_engine ||= Engine.new
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/markdownator/version"
4
+
5
Gem::Specification.new do |spec|
  spec.name = "markdownator"
  spec.version = Markdownator::VERSION
  spec.authors = ["alexrupom"]
  spec.email = ["alexrupom@hotmail.com"]

  spec.summary = "Convert files (Office docs, PDF, HTML, data, archives, images) to LLM-friendly Markdown."
  spec.description = [
    "Markdownator converts PDF, Word, Excel, PowerPoint, EPUB, HTML, CSV, JSON, XML, ",
    "ZIP archives and images into clean Markdown suitable for large language models, ",
    "using a pluggable converter architecture."
  ].join
  spec.homepage = "https://github.com/alexrupom/markdownator"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 2.7.0"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = spec.homepage
  spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/main/CHANGELOG.md"
  spec.metadata["rubygems_mfa_required"] = "true"

  # Package every file tracked by git except this gemspec itself and the
  # development-only paths (bin/test/spec/features directories, VCS and CI
  # configuration).
  tracked_files = Dir.chdir(__dir__) { `git ls-files -z`.split("\x0") }
  spec.files = tracked_files.reject do |f|
    (f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Heavy format gems are intentionally NOT runtime dependencies. Each converter
  # requires its gem lazily and raises a helpful error if it is missing, so apps
  # install only what they need. The gems used to exercise every format in the
  # test suite are declared as development dependencies in the Gemfile.
end
@@ -0,0 +1,50 @@
1
# Type signatures for the public Markdownator API.
module Markdownator
  VERSION: String

  def self.convert: (untyped source, **untyped options) -> Result
  def self.convert_local: (String path, **untyped options) -> Result
  def self.convert_stream: (untyped io, ?StreamInfo? stream_info, **untyped options) -> Result
  def self.require_optional: (String gem_name, feature: String) -> void

  class Error < StandardError
  end

  class UnsupportedFormatError < Error
  end

  class MissingDependencyError < Error
  end

  class FileConversionError < Error
  end

  class StreamInfo
    EXTENSION_TO_MIMETYPE: Hash[String, String]

    attr_reader mimetype: String?
    attr_reader extension: String?
    attr_reader charset: String?
    attr_reader filename: String?
    attr_reader url: String?
    attr_reader local_path: String?

    # Every hint is an optional keyword.
    def initialize: (
      ?mimetype: String?,
      ?extension: String?,
      ?charset: String?,
      ?filename: String?,
      ?url: String?,
      ?local_path: String?
    ) -> void

    def guessed_mimetype: () -> String?
    def copy_with: (**untyped overrides) -> StreamInfo
  end

  class Result
    attr_reader markdown: String
    attr_reader title: String?
    attr_reader metadata: Hash[untyped, untyped]

    # `markdown` is converted with `to_s`; a nil `metadata` becomes `{}`.
    def initialize: (markdown: untyped, ?title: String?, ?metadata: Hash[untyped, untyped]?) -> void

    def to_s: () -> String
    def text_content: () -> String
  end

  class Engine
    DEFAULT_CONVERTER_ORDER: Array[Symbol]

    attr_reader converters: Array[untyped]

    def self.default_converters: () -> Array[untyped]
    def self.camelize: (Symbol | String name) -> String

    def initialize: (?converters: Array[untyped]?, **untyped options) -> void

    def convert: (untyped source, **untyped options) -> Result
    def convert_local: (String path, **untyped options) -> Result
    def convert_url: (String url, **untyped options) -> Result
    def convert_stream: (untyped io, ?StreamInfo? stream_info, **untyped options) -> Result
  end
end