markdownator 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4799d3266ce18fa6adff8a3264b255a8bcaa2974888e260542adacd709566f2a
4
- data.tar.gz: 50f573d19ff4b5407220fe4e8d2b50ebbc481c7d983183e24032ed6a8e4671f4
3
+ metadata.gz: ef75f03777f577049069b98fee3104ee31f7ad3d4cece96f3cc78052db648f6e
4
+ data.tar.gz: '047778687f87ca470f627d082e7653c17999c8981f7c579512eed0c8a31cb9fa'
5
5
  SHA512:
6
- metadata.gz: 71873a123d242b1ff45147fc28eb50300c2dfb82d92dd5939be352dbeb05bbf9ae97c1c7003c5edf606e918176f5fa2dcb53b3f2ef2b0c3e9437a3fdb3faad81
7
- data.tar.gz: 2c7aaf6871850fa39e323e32f7da9fe3870e06159fda18e4f04e268231214efd717c88f0cd41d291a07964b705e653cac677459a1921a1258545a68e72fd273e
6
+ metadata.gz: e3ce45e8c1f63c4f374aa475a27c3a567846423005caafa8999e136531139a36695fd180d806dcd5e49d1a5414b7a372d6a448f3c1bccb49d5868c13ae2e3870
7
+ data.tar.gz: ef73a6ccac7125b16fd8838a88f56be01c8fa555b39887963e8a79170c34428c21f20ac7fea867af06c6aa3410114ad01619f804ed3f1333669596bd656aed2c
data/CHANGELOG.md CHANGED
@@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.1.2] - 2026-06-13
11
+
12
+ ### Changed
13
+
14
+ - XLSX conversion now reads the workbook directly with rubyzip and Nokogiri
15
+ instead of `roo`, so every ZIP-based format (DOCX, XLSX, PPTX, EPUB) shares one
16
+ approach and the `roo` dependency is dropped.
17
+ - Moved the HTML renderer to `Markdownator::Renderers::HtmlRenderer` (it renders
18
+ Markdown; it does not convert a source file).
19
+
10
20
  ## [0.1.1] - 2026-06-13
11
21
 
12
22
  ### Changed
@@ -29,6 +39,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
29
39
  dependencies.
30
40
  - Pluggable LLM image-captioner hook (off by default).
31
41
 
32
- [Unreleased]: https://github.com/alexrupom/markdownator/compare/v0.1.1...HEAD
42
+ [Unreleased]: https://github.com/alexrupom/markdownator/compare/v0.1.2...HEAD
43
+ [0.1.2]: https://github.com/alexrupom/markdownator/compare/v0.1.1...v0.1.2
33
44
  [0.1.1]: https://github.com/alexrupom/markdownator/compare/v0.1.0...v0.1.1
34
45
  [0.1.0]: https://github.com/alexrupom/markdownator/releases/tag/v0.1.0
data/Gemfile CHANGED
@@ -17,5 +17,4 @@ gem "rubocop", "~> 1.21"
17
17
  gem "exifr", "~> 1.3"
18
18
  gem "nokogiri", "~> 1.15"
19
19
  gem "pdf-reader", "~> 2.12"
20
- gem "roo", "~> 2.10"
21
20
  gem "rubyzip", "~> 2.3"
data/README.md CHANGED
@@ -16,7 +16,7 @@ libraries **lazily**, so you only install the gems for the formats you actually
16
16
  | HTML | `.html`, `.htm` | `nokogiri` |
17
17
  | XML | `.xml` | `nokogiri` |
18
18
  | Word | `.docx` | `rubyzip`, `nokogiri` |
19
- | Excel | `.xlsx` | `roo` |
19
+ | Excel | `.xlsx` | `rubyzip`, `nokogiri` |
20
20
  | PowerPoint | `.pptx` | `rubyzip`, `nokogiri` |
21
21
  | PDF | `.pdf` | `pdf-reader` |
22
22
  | EPUB | `.epub` | `rubyzip`, `nokogiri` |
@@ -36,9 +36,8 @@ Then add the gems for the formats you need, e.g.:
36
36
 
37
37
  ```ruby
38
38
  gem "pdf-reader" # PDF
39
- gem "roo" # XLSX
40
- gem "rubyzip" # DOCX, PPTX, EPUB, ZIP
41
- gem "nokogiri" # HTML, XML, DOCX, PPTX, EPUB
39
+ gem "rubyzip" # DOCX, XLSX, PPTX, EPUB, ZIP
40
+ gem "nokogiri" # HTML, XML, DOCX, XLSX, PPTX, EPUB
42
41
  gem "exifr" # image EXIF
43
42
  ```
44
43
 
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative "html_renderer"
3
+ require_relative "../renderers/html_renderer"
4
4
 
5
5
  module Markdownator
6
6
  module Converters
@@ -20,7 +20,7 @@ module Markdownator
20
20
  Markdownator.require_optional("nokogiri", feature: "HTML conversion")
21
21
  doc = Nokogiri::HTML(html)
22
22
  root = doc.at_css("body") || doc.root || doc
23
- HtmlRenderer.new.render(root)
23
+ Renderers::HtmlRenderer.new.render(root)
24
24
  end
25
25
 
26
26
  def self.extract_title(html)
@@ -1,11 +1,12 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "tempfile"
4
-
5
3
  module Markdownator
6
4
  module Converters
7
5
  # Converts an Excel .xlsx workbook into Markdown: one `## SheetName` heading
8
- # and a Markdown table per sheet, using the `roo` gem.
6
+ # and a Markdown table per sheet.
7
+ #
8
+ # Reads the OOXML zip directly with rubyzip and Nokogiri, the same approach
9
+ # used by the DOCX, PPTX, and EPUB converters.
9
10
  class Xlsx < Base
10
11
  def accepts?(_io, stream_info)
11
12
  matches?(
@@ -16,12 +17,16 @@ module Markdownator
16
17
  end
17
18
 
18
19
  def convert(io, _stream_info, **_options)
19
- Markdownator.require_optional("roo", feature: "XLSX conversion")
20
+ Markdownator.require_optional("zip", feature: "XLSX conversion")
21
+ Markdownator.require_optional("nokogiri", feature: "XLSX conversion")
20
22
 
21
- with_tempfile(io) do |path|
22
- workbook = Roo::Excelx.new(path)
23
- sections = workbook.sheets.map { |name| render_sheet(workbook, name) }
24
- Result.new(markdown: sections.compact.join("\n\n"))
23
+ ::Zip::File.open_buffer(io) do |zip|
24
+ shared = shared_strings(zip)
25
+ sections = sheets(zip).filter_map do |name, path|
26
+ table = markdown_table(parse_sheet(read(zip, path), shared))
27
+ "## #{name}\n\n#{table}" unless table.empty?
28
+ end
29
+ return Result.new(markdown: sections.join("\n\n"))
25
30
  end
26
31
  rescue StandardError => e
27
32
  raise FileConversionError, "Could not read XLSX: #{e.message}"
@@ -29,23 +34,85 @@ module Markdownator
29
34
 
30
35
  private
31
36
 
32
- def render_sheet(workbook, name)
33
- sheet = workbook.sheet(name)
34
- rows = (1..sheet.last_row.to_i).map do |r|
35
- (1..sheet.last_column.to_i).map { |c| sheet.cell(r, c) }
37
+ def read(zip, path)
38
+ zip.find_entry(path)&.get_input_stream&.read
39
+ end
40
+
41
+ # Ordered [[sheet_name, worksheet_path], ...] resolved via the workbook
42
+ # relationships.
43
+ def sheets(zip)
44
+ workbook = parse(read(zip, "xl/workbook.xml"))
45
+ return [] if workbook.nil?
46
+
47
+ rels = relationships(zip)
48
+ workbook.xpath("//sheets/sheet").filter_map do |sheet|
49
+ path = resolve_target(rels[sheet["id"]])
50
+ [sheet["name"].to_s, path] if path
36
51
  end
37
- table = markdown_table(rows)
38
- table.empty? ? nil : "## #{name}\n\n#{table}"
39
52
  end
40
53
 
41
- def with_tempfile(io)
42
- Tempfile.create(["markdownator", ".xlsx"]) do |file|
43
- file.binmode
44
- file.write(io.read)
45
- file.flush
46
- yield file.path
54
+ def relationships(zip)
55
+ doc = parse(read(zip, "xl/_rels/workbook.xml.rels"))
56
+ return {} if doc.nil?
57
+
58
+ doc.xpath("//Relationship").to_h { |rel| [rel["Id"], rel["Target"]] }
59
+ end
60
+
61
+ def resolve_target(target)
62
+ return nil if target.nil? || target.empty?
63
+
64
+ target = target.delete_prefix("/")
65
+ target.start_with?("xl/") ? target : "xl/#{target}"
66
+ end
67
+
68
+ # The shared string table: index -> text.
69
+ def shared_strings(zip)
70
+ doc = parse(read(zip, "xl/sharedStrings.xml"))
71
+ return [] if doc.nil?
72
+
73
+ doc.xpath("//si").map { |si| si.xpath(".//t").map(&:text).join }
74
+ end
75
+
76
+ def parse_sheet(xml, shared)
77
+ doc = parse(xml)
78
+ return [] if doc.nil?
79
+
80
+ doc.xpath("//sheetData/row").map do |row|
81
+ values = {}
82
+ width = 0
83
+ row.xpath("./c").each_with_index do |cell, position|
84
+ column = column_index(cell["r"]) || (position + 1)
85
+ width = column if column > width
86
+ values[column] = cell_value(cell, shared)
87
+ end
88
+ (1..width).map { |i| values[i] || "" }
89
+ end
90
+ end
91
+
92
+ def cell_value(cell, shared)
93
+ case cell["t"]
94
+ when "s" then shared[cell.at_xpath("./v")&.text.to_i].to_s
95
+ when "inlineStr" then cell.xpath("./is//t").map(&:text).join
96
+ when "b" then cell.at_xpath("./v")&.text == "1" ? "TRUE" : "FALSE"
97
+ else cell.at_xpath("./v")&.text.to_s
47
98
  end
48
99
  end
100
+
101
+ # Converts a cell reference like "B2" into a 1-based column index (2).
102
+ def column_index(ref)
103
+ letters = ref.to_s[/\A[A-Z]+/i]
104
+ return nil if letters.nil?
105
+
106
+ letters.upcase.each_char.reduce(0) { |acc, char| (acc * 26) + (char.ord - 64) }
107
+ end
108
+
109
+ def parse(xml)
110
+ return nil if xml.nil?
111
+
112
+ doc = Nokogiri::XML(xml)
113
+ doc.remove_namespaces!
114
+ doc
115
+ end
49
116
  end
50
117
  end
51
118
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Markdownator
4
- module Converters
4
+ module Renderers
5
5
  # Walks a Nokogiri HTML node tree and renders Markdown. A focused,
6
6
  # dependency-free replacement for reverse_markdown: HTML conversion needs
7
7
  # only Nokogiri (which reverse_markdown depended on anyway).
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Markdownator
4
- VERSION = "0.1.1"
4
+ VERSION = "0.1.2"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: markdownator
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - alexrupom
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-06-12 00:00:00.000000000 Z
11
+ date: 2026-06-13 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Markdownator converts PDF, Word, Excel, PowerPoint, EPUB, HTML, CSV,
14
14
  JSON, XML, ZIP archives and images into clean Markdown suitable for large language
@@ -33,7 +33,6 @@ files:
33
33
  - lib/markdownator/converters/docx.rb
34
34
  - lib/markdownator/converters/epub.rb
35
35
  - lib/markdownator/converters/html.rb
36
- - lib/markdownator/converters/html_renderer.rb
37
36
  - lib/markdownator/converters/image.rb
38
37
  - lib/markdownator/converters/json.rb
39
38
  - lib/markdownator/converters/pdf.rb
@@ -44,6 +43,7 @@ files:
44
43
  - lib/markdownator/converters/zip.rb
45
44
  - lib/markdownator/engine.rb
46
45
  - lib/markdownator/errors.rb
46
+ - lib/markdownator/renderers/html_renderer.rb
47
47
  - lib/markdownator/result.rb
48
48
  - lib/markdownator/stream_info.rb
49
49
  - lib/markdownator/version.rb