RubyGems - markdownator - Versions diffs - 0.1.1 → 0.1.2 - Mend

markdownator 0.1.1 → 0.1.2

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (9)

checksums.yaml +4 -4
data/CHANGELOG.md +12 -1
data/Gemfile +0 -1
data/README.md +3 -4
data/lib/markdownator/converters/html.rb +2 -2
data/lib/markdownator/converters/xlsx.rb +87 -20
data/lib/markdownator/{converters → renderers}/html_renderer.rb +1 -1
data/lib/markdownator/version.rb +1 -1
metadata +3 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 4799d3266ce18fa6adff8a3264b255a8bcaa2974888e260542adacd709566f2a
-  data.tar.gz: 50f573d19ff4b5407220fe4e8d2b50ebbc481c7d983183e24032ed6a8e4671f4
+  metadata.gz: ef75f03777f577049069b98fee3104ee31f7ad3d4cece96f3cc78052db648f6e
+  data.tar.gz: '047778687f87ca470f627d082e7653c17999c8981f7c579512eed0c8a31cb9fa'
 SHA512:
-  metadata.gz: 71873a123d242b1ff45147fc28eb50300c2dfb82d92dd5939be352dbeb05bbf9ae97c1c7003c5edf606e918176f5fa2dcb53b3f2ef2b0c3e9437a3fdb3faad81
-  data.tar.gz: 2c7aaf6871850fa39e323e32f7da9fe3870e06159fda18e4f04e268231214efd717c88f0cd41d291a07964b705e653cac677459a1921a1258545a68e72fd273e
+  metadata.gz: e3ce45e8c1f63c4f374aa475a27c3a567846423005caafa8999e136531139a36695fd180d806dcd5e49d1a5414b7a372d6a448f3c1bccb49d5868c13ae2e3870
+  data.tar.gz: ef73a6ccac7125b16fd8838a88f56be01c8fa555b39887963e8a79170c34428c21f20ac7fea867af06c6aa3410114ad01619f804ed3f1333669596bd656aed2c

data/CHANGELOG.md CHANGED Viewed

@@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
+## [0.1.2] - 2026-06-13
+### Changed
+- XLSX conversion now reads the workbook directly with rubyzip and Nokogiri
+  instead of `roo`, so every Office format (DOCX, XLSX, PPTX, EPUB) shares one
+  approach and the `roo` dependency is dropped.
+- Moved the HTML renderer to `Markdownator::Renderers::HtmlRenderer` (it renders
+  Markdown, it does not convert a source file).
 ## [0.1.1] - 2026-06-13
 ### Changed
@@ -29,6 +39,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   dependencies.
 - Pluggable LLM image-captioner hook (off by default).
-[Unreleased]: https://github.com/alexrupom/markdownator/compare/v0.1.1...HEAD
+[Unreleased]: https://github.com/alexrupom/markdownator/compare/v0.1.2...HEAD
+[0.1.2]: https://github.com/alexrupom/markdownator/compare/v0.1.1...v0.1.2
 [0.1.1]: https://github.com/alexrupom/markdownator/compare/v0.1.0...v0.1.1
 [0.1.0]: https://github.com/alexrupom/markdownator/releases/tag/v0.1.0

data/Gemfile CHANGED Viewed

@@ -17,5 +17,4 @@ gem "rubocop", "~> 1.21"
 gem "exifr", "~> 1.3"
 gem "nokogiri", "~> 1.15"
 gem "pdf-reader", "~> 2.12"
-gem "roo", "~> 2.10"
 gem "rubyzip", "~> 2.3"

data/README.md CHANGED Viewed

@@ -16,7 +16,7 @@ libraries **lazily**, so you only install the gems for the formats you actually
 | HTML | `.html`, `.htm` | `nokogiri` |
 | XML | `.xml` | `nokogiri` |
 | Word | `.docx` | `rubyzip`, `nokogiri` |
-| Excel | `.xlsx` | `roo` |
+| Excel | `.xlsx` | `rubyzip`, `nokogiri` |
 | PowerPoint | `.pptx` | `rubyzip`, `nokogiri` |
 | PDF | `.pdf` | `pdf-reader` |
 | EPUB | `.epub` | `rubyzip`, `nokogiri` |
@@ -36,9 +36,8 @@ Then add the gems for the formats you need, e.g.:
 ```ruby
 gem "pdf-reader"      # PDF
-gem "roo"             # XLSX
-gem "rubyzip"         # DOCX, PPTX, EPUB, ZIP
-gem "nokogiri"        # HTML, XML, DOCX, PPTX, EPUB
+gem "rubyzip"         # DOCX, XLSX, PPTX, EPUB, ZIP
+gem "nokogiri"        # HTML, XML, DOCX, XLSX, PPTX, EPUB
 gem "exifr"           # image EXIF
 ```

data/lib/markdownator/converters/html.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 # frozen_string_literal: true
-require_relative "html_renderer"
+require_relative "../renderers/html_renderer"
 module Markdownator
   module Converters
@@ -20,7 +20,7 @@ module Markdownator
         Markdownator.require_optional("nokogiri", feature: "HTML conversion")
         doc = Nokogiri::HTML(html)
         root = doc.at_css("body") || doc.root || doc
-        HtmlRenderer.new.render(root)
+        Renderers::HtmlRenderer.new.render(root)
       end
       def self.extract_title(html)

data/lib/markdownator/converters/xlsx.rb CHANGED Viewed

@@ -1,11 +1,12 @@
 # frozen_string_literal: true
-require "tempfile"
 module Markdownator
   module Converters
     # Converts an Excel .xlsx workbook into Markdown: one `## SheetName` heading
-    # and a Markdown table per sheet, using the `roo` gem.
+    # and a Markdown table per sheet.
+    #
+    # Reads the OOXML zip directly with rubyzip and Nokogiri, the same approach
+    # used by the DOCX, PPTX, and EPUB converters.
     class Xlsx < Base
       def accepts?(_io, stream_info)
         matches?(
@@ -16,12 +17,16 @@ module Markdownator
       end
       def convert(io, _stream_info, **_options)
-        Markdownator.require_optional("roo", feature: "XLSX conversion")
+        Markdownator.require_optional("zip", feature: "XLSX conversion")
+        Markdownator.require_optional("nokogiri", feature: "XLSX conversion")
-        with_tempfile(io) do |path|
-          workbook = Roo::Excelx.new(path)
-          sections = workbook.sheets.map { |name| render_sheet(workbook, name) }
-          Result.new(markdown: sections.compact.join("\n\n"))
+        ::Zip::File.open_buffer(io) do |zip|
+          shared = shared_strings(zip)
+          sections = sheets(zip).filter_map do |name, path|
+            table = markdown_table(parse_sheet(read(zip, path), shared))
+            "## #{name}\n\n#{table}" unless table.empty?
+          end
+          return Result.new(markdown: sections.join("\n\n"))
         end
       rescue StandardError => e
         raise FileConversionError, "Could not read XLSX: #{e.message}"
@@ -29,23 +34,85 @@ module Markdownator
       private
-      def render_sheet(workbook, name)
-        sheet = workbook.sheet(name)
-        rows = (1..sheet.last_row.to_i).map do |r|
-          (1..sheet.last_column.to_i).map { |c| sheet.cell(r, c) }
+      def read(zip, path)
+        zip.find_entry(path)&.get_input_stream&.read
+      end
+      # Ordered [[sheet_name, worksheet_path], ...] resolved via the workbook
+      # relationships.
+      def sheets(zip)
+        workbook = parse(read(zip, "xl/workbook.xml"))
+        return [] if workbook.nil?
+        rels = relationships(zip)
+        workbook.xpath("//sheets/sheet").filter_map do |sheet|
+          path = resolve_target(rels[sheet["id"]])
+          [sheet["name"].to_s, path] if path
         end
-        table = markdown_table(rows)
-        table.empty? ? nil : "## #{name}\n\n#{table}"
       end
-      def with_tempfile(io)
-        Tempfile.create(["markdownator", ".xlsx"]) do |file|
-          file.binmode
-          file.write(io.read)
-          file.flush
-          yield file.path
+      def relationships(zip)
+        doc = parse(read(zip, "xl/_rels/workbook.xml.rels"))
+        return {} if doc.nil?
+        doc.xpath("//Relationship").to_h { |rel| [rel["Id"], rel["Target"]] }
+      end
+      def resolve_target(target)
+        return nil if target.nil? || target.empty?
+        target = target.delete_prefix("/")
+        target.start_with?("xl/") ? target : "xl/#{target}"
+      end
+      # The shared string table: index -> text.
+      def shared_strings(zip)
+        doc = parse(read(zip, "xl/sharedStrings.xml"))
+        return [] if doc.nil?
+        doc.xpath("//si").map { |si| si.xpath(".//t").map(&:text).join }
+      end
+      def parse_sheet(xml, shared)
+        doc = parse(xml)
+        return [] if doc.nil?
+        doc.xpath("//sheetData/row").map do |row|
+          values = {}
+          width = 0
+          row.xpath("./c").each_with_index do |cell, position|
+            column = column_index(cell["r"]) || (position + 1)
+            width = column if column > width
+            values[column] = cell_value(cell, shared)
+          end
+          (1..width).map { |i| values[i] || "" }
+        end
+      end
+      def cell_value(cell, shared)
+        case cell["t"]
+        when "s" then shared[cell.at_xpath("./v")&.text.to_i].to_s
+        when "inlineStr" then cell.xpath("./is//t").map(&:text).join
+        when "b" then cell.at_xpath("./v")&.text == "1" ? "TRUE" : "FALSE"
+        else cell.at_xpath("./v")&.text.to_s
         end
       end
+      # Converts a cell reference like "B2" into a 1-based column index (2).
+      def column_index(ref)
+        letters = ref.to_s[/\A[A-Z]+/i]
+        return nil if letters.nil?
+        letters.upcase.each_char.reduce(0) { |acc, char| (acc * 26) + (char.ord - 64) }
+      end
+      def parse(xml)
+        return nil if xml.nil?
+        doc = Nokogiri::XML(xml)
+        doc.remove_namespaces!
+        doc
+      end
     end
   end
 end

data/lib/markdownator/{converters → renderers}/html_renderer.rb RENAMED Viewed

@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 module Markdownator
-  module Converters
+  module Renderers
     # Walks a Nokogiri HTML node tree and renders Markdown. A focused,
     # dependency-free replacement for reverse_markdown: HTML conversion needs
     # only Nokogiri (which reverse_markdown depended on anyway).

data/lib/markdownator/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Markdownator
-  VERSION = "0.1.1"
+  VERSION = "0.1.2"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: markdownator
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - alexrupom
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2026-06-12 00:00:00.000000000 Z
+date: 2026-06-13 00:00:00.000000000 Z
 dependencies: []
 description: Markdownator converts PDF, Word, Excel, PowerPoint, EPUB, HTML, CSV,
   JSON, XML, ZIP archives and images into clean Markdown suitable for large language
@@ -33,7 +33,6 @@ files:
 - lib/markdownator/converters/docx.rb
 - lib/markdownator/converters/epub.rb
 - lib/markdownator/converters/html.rb
-- lib/markdownator/converters/html_renderer.rb
 - lib/markdownator/converters/image.rb
 - lib/markdownator/converters/json.rb
 - lib/markdownator/converters/pdf.rb
@@ -44,6 +43,7 @@ files:
 - lib/markdownator/converters/zip.rb
 - lib/markdownator/engine.rb
 - lib/markdownator/errors.rb
+- lib/markdownator/renderers/html_renderer.rb
 - lib/markdownator/result.rb
 - lib/markdownator/stream_info.rb
 - lib/markdownator/version.rb