epub2md 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: f4a66cfbc763f53d06686ccce9731cdea76b2d5d7221c84868d8aba1adf2ac2a
4
+ data.tar.gz: 9ecff2c2013f7a719d5314074b939b96864efbbef1b04ab5b11a6a65f533af4c
5
+ SHA512:
6
+ metadata.gz: a442a7a1abdce5712934be4f6a5b22018eac0f1502516d9b834e5e4e12c815059276425a4ac4baa2693d7e30e7ced8a7e9eff21ab47458770c7d38b2de9a3556
7
+ data.tar.gz: d73b7bd071dbe7469a6292c1076f51a24f1c6924cf46e076bafe654be3612f7d870347446806717a6059695bb27b08ca1f118f8e2cd7ca0bd52b2cbc52589848
data/Gemfile ADDED
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ source 'https://rubygems.org'
4
+
5
+ gem 'rubyzip', '~> 2.3'
6
+ gem 'nokogiri', '~> 1.13'
7
+ gem 'reverse_markdown', '~> 2.1'
8
+ gem 'thor', '~> 1.2'
9
+ gem 'httparty', '~> 0.21'
10
+
11
+ group :development, :test do
12
+ gem 'rspec', '~> 3.12'
13
+ gem 'pry', '~> 0.14'
14
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,78 @@
1
+ GEM
2
+ remote: https://rubygems.org/
3
+ specs:
4
+ bigdecimal (4.0.1)
5
+ coderay (1.1.3)
6
+ csv (3.3.5)
7
+ diff-lcs (1.6.2)
8
+ httparty (0.24.2)
9
+ csv
10
+ mini_mime (>= 1.0.0)
11
+ multi_xml (>= 0.5.2)
12
+ io-console (0.8.2)
13
+ method_source (1.1.0)
14
+ mini_mime (1.1.5)
15
+ multi_xml (0.8.1)
16
+ bigdecimal (>= 3.1, < 5)
17
+ nokogiri (1.19.0-aarch64-linux-gnu)
18
+ racc (~> 1.4)
19
+ nokogiri (1.19.0-aarch64-linux-musl)
20
+ racc (~> 1.4)
21
+ nokogiri (1.19.0-arm-linux-gnu)
22
+ racc (~> 1.4)
23
+ nokogiri (1.19.0-arm-linux-musl)
24
+ racc (~> 1.4)
25
+ nokogiri (1.19.0-arm64-darwin)
26
+ racc (~> 1.4)
27
+ nokogiri (1.19.0-x86_64-darwin)
28
+ racc (~> 1.4)
29
+ nokogiri (1.19.0-x86_64-linux-gnu)
30
+ racc (~> 1.4)
31
+ nokogiri (1.19.0-x86_64-linux-musl)
32
+ racc (~> 1.4)
33
+ pry (0.16.0)
34
+ coderay (~> 1.1)
35
+ method_source (~> 1.0)
36
+ reline (>= 0.6.0)
37
+ racc (1.8.1)
38
+ reline (0.6.3)
39
+ io-console (~> 0.5)
40
+ reverse_markdown (2.1.1)
41
+ nokogiri
42
+ rspec (3.13.2)
43
+ rspec-core (~> 3.13.0)
44
+ rspec-expectations (~> 3.13.0)
45
+ rspec-mocks (~> 3.13.0)
46
+ rspec-core (3.13.6)
47
+ rspec-support (~> 3.13.0)
48
+ rspec-expectations (3.13.5)
49
+ diff-lcs (>= 1.2.0, < 2.0)
50
+ rspec-support (~> 3.13.0)
51
+ rspec-mocks (3.13.7)
52
+ diff-lcs (>= 1.2.0, < 2.0)
53
+ rspec-support (~> 3.13.0)
54
+ rspec-support (3.13.7)
55
+ rubyzip (2.4.1)
56
+ thor (1.5.0)
57
+
58
+ PLATFORMS
59
+ aarch64-linux-gnu
60
+ aarch64-linux-musl
61
+ arm-linux-gnu
62
+ arm-linux-musl
63
+ arm64-darwin
64
+ x86_64-darwin
65
+ x86_64-linux-gnu
66
+ x86_64-linux-musl
67
+
68
+ DEPENDENCIES
69
+ httparty (~> 0.21)
70
+ nokogiri (~> 1.13)
71
+ pry (~> 0.14)
72
+ reverse_markdown (~> 2.1)
73
+ rspec (~> 3.12)
74
+ rubyzip (~> 2.3)
75
+ thor (~> 1.2)
76
+
77
+ BUNDLED WITH
78
+ 2.7.2
data/README.md ADDED
@@ -0,0 +1,96 @@
1
+ # Ruby EPUB2MD
2
+
3
+ A Ruby implementation of the epub2md tool for converting EPUB files to Markdown format.
4
+
5
+ ## Features
6
+
7
+ - Convert EPUB files to multiple Markdown files
8
+ - Convert EPUB files to a single merged Markdown file
9
+ - Extract EPUB metadata and structure information
10
+ - Localize images (extract images from EPUB to local directory when using `--localize` option)
11
+ - Command-line interface for easy usage
12
+ - Programmatic API for integration into other tools
13
+
14
+ ## Installation
15
+
16
+ Add this line to your application's Gemfile:
17
+
18
+ ```ruby
19
+ gem 'epub2md'
20
+ ```
21
+
22
+ And then execute:
23
+
24
+ ```bash
25
+ bundle install
26
+ ```
27
+
28
+ Or install it yourself as:
29
+
30
+ ```bash
31
+ gem install epub2md
32
+ ```
33
+
34
+ ## Usage
35
+
36
+ ### Command Line Interface
37
+
38
+ ```bash
39
+ # Convert EPUB to multiple Markdown files (text only, no images extracted)
40
+ bundle exec ruby bin/epub2md convert path/to/book.epub
41
+
42
+ # Convert EPUB to a single Markdown file (text only, no images extracted)
43
+ bundle exec ruby bin/epub2md merge path/to/book.epub
44
+
45
+ # Show EPUB information (human-readable format)
46
+ bundle exec ruby bin/epub2md info path/to/book.epub
47
+
48
+ # Show EPUB information (JSON format)
49
+ bundle exec ruby bin/epub2md info path/to/book.epub --json
50
+
51
+ # Show EPUB structure
52
+ bundle exec ruby bin/epub2md structure path/to/book.epub
53
+
54
+ # Show EPUB sections
55
+ bundle exec ruby bin/epub2md sections path/to/book.epub
56
+
57
+ # Localize images during conversion (extract images from EPUB to local directory)
58
+ # This creates an 'images' directory with all images from the EPUB
59
+ bundle exec ruby bin/epub2md convert path/to/book.epub --localize
60
+
61
+ # Specify output directory
62
+ bundle exec ruby bin/epub2md convert path/to/book.epub --output /custom/output/dir
63
+ ```
64
+
65
+ **Note**: Image extraction only occurs when using the `--localize` option. Without this option, only text content is converted to Markdown, and images remain embedded in the EPUB file but are not extracted to the output directory.
66
+
67
+ ### Programmatic Usage
68
+
69
+ ```ruby
70
+ require 'epub2md'
71
+
72
+ # Parse an EPUB file
73
+ parser = Epub2md::Parser.new('path/to/book.epub')
74
+ parser.parse
75
+
76
+ # Convert to multiple markdown files
77
+ converter = Epub2md::Converter.new(parser)
78
+ converter.convert_to_markdown(output_dir: './output')
79
+
80
+ # Or convert to a single markdown file
81
+ converter.convert_to_single_markdown(output_filename: './output/merged_book.md')
82
+ ```
83
+
84
+ ## Development
85
+
86
+ After checking out the repo, run `bundle install` to install dependencies. Then, run `bundle exec rspec` to run the tests.
87
+
88
+ To install this gem onto your local machine, run `bundle exec rake install`.
89
+
90
+ ## Contributing
91
+
92
+ Bug reports and pull requests are welcome on GitHub at https://github.com/GengCen-Qin/epub2md.
93
+
94
+ ## License
95
+
96
+ The gem is available as open source under the terms of the MIT License.
data/Rakefile ADDED
@@ -0,0 +1,27 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
7
+
8
# Convenience tasks for working on the gem locally.
namespace :epub2md do
  desc "Install dependencies"
  task :install do
    sh "bundle", "install"
  end

  desc "Run tests"
  task :test => :spec

  desc "Build gem"
  task :build do
    sh "gem", "build", "epub2md.gemspec"
  end

  desc "Install gem locally"
  task :install_gem => :build do
    # Several *.gem files may be lying around from earlier builds; install the
    # one that was written most recently instead of whichever glob returns first.
    gem_file = Dir.glob("*.gem").max_by { |file| File.mtime(file) }
    # Pass argv separately so the file name never goes through a shell.
    sh "gem", "install", gem_file if gem_file
  end
end
data/bin/epub2md ADDED
@@ -0,0 +1,145 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'thor'
4
+ require_relative '../lib/epub2md'
5
+
6
+ module Epub2md
7
+ # Command-line interface for the EPUB to Markdown converter
8
+ class CLI < Thor
9
+ class_option :verbose, type: :boolean, default: false, desc: 'Enable verbose output'
10
+
11
+ # Convert EPUB file to multiple Markdown files
12
+ desc 'convert PATH', 'Convert EPUB file to multiple Markdown files'
13
+ option :output, aliases: '-o', type: :string, desc: 'Output directory'
14
+ option :localize, type: :boolean, default: false, desc: 'Extract images to local directory'
15
# Thor command: convert the EPUB at +path+ into one Markdown file per section.
#
# The output directory is taken from the --output option; when it is absent
# a "<basename>_markdown" directory next to the EPUB is used.
#
# @param path [String] path to the EPUB file
def convert(path)
  validate_epub_path(path)
  setup_verbose_logging

  epub = Parser.new(path)
  epub.parse

  target_dir = options[:output]
  target_dir ||= "#{File.dirname(path)}/#{File.basename(path, '.epub')}_markdown"

  puts "Converting #{path} to Markdown files..." if options[:verbose]
  written = Converter.new(epub).convert_to_markdown(
    localize_images: options[:localize],
    output_dir: target_dir
  )

  puts "Conversion complete! Created #{written.length} Markdown files in: #{target_dir}"
  return unless options[:verbose]

  written.each { |file| puts " - #{file}" }
end
34
+
35
+ desc 'merge PATH', 'Convert EPUB file to a single Markdown file'
36
+ option :output, aliases: '-o', type: :string, desc: 'Output filename'
37
+ option :localize, type: :boolean, default: false, desc: 'Download and localize remote images'
38
# Thor command: convert the EPUB at +path+ into a single merged Markdown file.
#
# The output file is taken from the --output option; when it is absent a
# "<basename>_merged.md" file next to the EPUB is used.
#
# @param path [String] path to the EPUB file
def merge(path)
  validate_epub_path(path)
  setup_verbose_logging

  epub = Parser.new(path)
  epub.parse

  target_file = options[:output]
  target_file ||= "#{File.dirname(path)}/#{File.basename(path, '.epub')}_merged.md"

  puts "Converting #{path} to single Markdown file..." if options[:verbose]
  created = Converter.new(epub).convert_to_single_markdown(
    localize_images: options[:localize],
    output_filename: target_file
  )

  puts "Merged conversion complete! Created: #{created}"
end
56
+
57
+ desc 'info PATH', 'Show basic information about the EPUB file'
58
+ option :json, type: :boolean, default: false, desc: 'Output information in JSON format'
59
# Thor command: print basic information about the EPUB at +path+.
#
# With the --json option the data is emitted as pretty-printed JSON;
# otherwise a human-readable summary is printed line by line.
#
# @param path [String] path to the EPUB file
def info(path)
  validate_epub_path(path)

  epub = Parser.new(path)
  epub.parse

  file_name = File.basename(path)
  meta = epub.metadata

  unless options[:json]
    puts "EPUB Information for: #{file_name}"
    puts "Title: #{meta[:title] || 'Unknown'}"
    puts "Author: #{meta[:author] || 'Unknown'}"
    puts "Language: #{meta[:language] || 'Unknown'}"
    puts "Publisher: #{meta[:publisher] || 'Unknown'}"
    puts "Description: #{meta[:description] || 'None'}"
    puts "Number of sections: #{epub.sections.length}"
    puts "Number of manifest items: #{epub.manifest.length}"
    return
  end

  # JSON is only needed on this path, so it is loaded lazily.
  require 'json'
  report = {
    file: file_name,
    metadata: meta,
    stats: {
      sections_count: epub.sections.length,
      manifest_items_count: epub.manifest.length
    }
  }
  puts JSON.pretty_generate(report)
end
89
+
90
+ desc 'structure PATH', 'Show the structure of the EPUB file'
91
# Thor command: print the table-of-contents tree of the EPUB at +path+.
#
# @param path [String] path to the EPUB file
def structure(path)
  validate_epub_path(path)

  epub = Parser.new(path)
  epub.parse

  file_name = File.basename(path)
  puts "EPUB Structure for: #{file_name}"
  print_structure(epub.structure)
end
100
+
101
+ desc 'sections PATH', 'List all sections/chapters in the EPUB file'
102
# Thor command: list every section (chapter) of the EPUB at +path+,
# numbered from 1, with its title (or 'Untitled') and manifest id.
#
# @param path [String] path to the EPUB file
def sections(path)
  validate_epub_path(path)

  epub = Parser.new(path)
  epub.parse

  puts "Sections in: #{File.basename(path)}"
  epub.sections.each.with_index(1) do |section, number|
    label = section[:title] || 'Untitled'
    puts "#{number}. #{label} (ID: #{section[:id]})"
  end
end
114
+
115
+ private
116
+
117
# Abort the command with exit status 1 unless +path+ names an existing file
# with an ".epub" extension.
#
# Error messages are written to standard error so they do not pollute output
# that callers may be piping (for example `info --json`). The extension check
# ignores case, so "BOOK.EPUB" is accepted as well.
#
# @param path [String] path supplied on the command line
# @return [nil] when the path is acceptable
def validate_epub_path(path)
  unless File.exist?(path)
    warn "Error: File does not exist: #{path}"
    exit(1)
  end

  return if path.downcase.end_with?('.epub')

  warn "Error: File is not an EPUB: #{path}"
  exit(1)
end
128
+
129
# Turn on Ruby's global $VERBOSE flag when the --verbose option is set;
# leaves the flag untouched otherwise.
def setup_verbose_logging
  return unless options[:verbose]

  $VERBOSE = true
end
132
+
133
# Print a table-of-contents tree, one "- name (path)" line per entry,
# prefixing each nesting level with one more copy of the indent string.
#
# @param items [Array<Hash>] entries with :name, :path and :children keys
# @param level [Integer] current nesting depth (0 for top-level entries)
def print_structure(items, level = 0)
  prefix = " " * level
  items.each do |entry|
    label = entry[:name] || 'Unnamed'
    location = entry[:path] || 'No path'
    puts "#{prefix}- #{label} (#{location})"

    nested = entry[:children]
    print_structure(nested, level + 1) if nested
  end
end
140
+ end
141
+ end
142
+
143
# Start the CLI unconditionally. This file is an executable, not a library:
# when it is launched through the wrapper script RubyGems generates, the
# wrapper `load`s this file, so `__FILE__ == $0` is false and a guarded call
# would silently do nothing.
Epub2md::CLI.start(ARGV)
data/epub2md.gemspec ADDED
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/epub2md/version'
4
+
5
# Gem specification for epub2md.
Gem::Specification.new do |spec|
  spec.name = 'epub2md'
  spec.version = Epub2md::VERSION
  spec.authors = ['lucas.qin']
  spec.email = ['qsc1956826@gmail.com']

  spec.summary = 'A Ruby gem to convert EPUB files to Markdown format'
  spec.description = 'Convert EPUB files to Markdown with support for images, chapters, and formatting'
  spec.homepage = 'https://github.com/GengCen-Qin/epub2md'
  spec.license = 'MIT'
  spec.required_ruby_version = Gem::Requirement.new('>= 2.6.0')

  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = spec.homepage
  spec.metadata['changelog_uri'] = "#{spec.homepage}/blob/main/CHANGELOG.md"

  # Specify which files should be added to the gem when it is released.
  spec.files = Dir.chdir(__dir__) do
    `git ls-files -z`.split("\x0").reject { |f| f.match?(%r{\A(?:test|spec|features)/}) }
  end

  # The command-line entry point lives at bin/epub2md. Pointing bindir at
  # 'exe' (which does not exist in this package) left the executables list
  # empty, so installing the gem provided no `epub2md` command.
  spec.bindir = 'bin'
  spec.executables = ['epub2md']
  spec.require_paths = ['lib']

  # Runtime dependencies
  spec.add_dependency 'rubyzip', '~> 2.3'
  spec.add_dependency 'nokogiri', '~> 1.13'
  spec.add_dependency 'reverse_markdown', '~> 2.1'
  spec.add_dependency 'thor', '~> 1.2'
  spec.add_dependency 'httparty', '~> 0.21'

  # Development dependencies
  spec.add_development_dependency 'rspec', '~> 3.12'
  spec.add_development_dependency 'pry', '~> 0.14'
end
@@ -0,0 +1,319 @@
1
+ require 'reverse_markdown'
2
+
3
+ module Epub2md
4
+ # Converts EPUB content to Markdown format
5
+ class Converter
6
+ # Initialize converter with a parser instance
7
+ # @param parser [Parser] EPUB parser instance
8
# Build a converter around an EPUB parser.
#
# The parser is stored as-is; the other methods of this class read its
# epub_path, sections, manifest, root_dir and zip_file.
#
# @param parser [Parser] EPUB parser instance
def initialize(parser)
  @parser = parser
end
11
+
12
+ # Convert EPUB to multiple Markdown files
13
+ # @param localize_images [Boolean] whether to extract images to local directory
14
+ # @param output_dir [String] directory to save Markdown files
15
+ # @return [Array<String>] list of created Markdown file paths
16
# Convert the EPUB into one Markdown file per section.
#
# Files are named "<index>-<sanitized title or id>.md", where the index is
# zero-padded to the number of digits in the section count.
#
# The output directory is created recursively. The "images" sub-directory is
# only created when +localize_images+ is true, so a text-only conversion does
# not leave an empty directory behind.
#
# @param localize_images [Boolean] whether to extract images to local directory
# @param output_dir [String, nil] directory to save Markdown files; defaults
#   to "<epub basename>_markdown" next to the EPUB
# @return [Array<String>] list of created Markdown file paths
def convert_to_markdown(localize_images: false, output_dir: nil)
  require 'fileutils'

  output_dir ||= File.dirname(@parser.epub_path) + "/#{@parser.epub_path.basename('.epub')}_markdown"
  FileUtils.mkdir_p(output_dir)

  images_dir = File.join(output_dir, 'images')

  # Stored so that HTML preprocessing can see the current conversion settings.
  @images_dir = images_dir
  @localize = localize_images

  if localize_images
    FileUtils.mkdir_p(images_dir)
    extract_images_to_directory(images_dir)
  end

  sections = @parser.sections
  # Digit count of the section total, computed once and without floating point.
  width = sections.length.to_s.length

  sections.each_with_index.map do |section, index|
    label = sanitize_filename(section[:title] || section[:id])
    filename = format("%0#{width}d-%s.md", index + 1, label)

    markdown_content = convert_html_to_markdown(section[:html_content])
    markdown_content = process_image_links(markdown_content, images_dir) if localize_images

    file_path = File.join(output_dir, filename)
    File.write(file_path, markdown_content, encoding: 'UTF-8')
    file_path
  end
ensure
  # Always reset per-conversion state, even when a section fails to convert.
  @images_dir = nil
  @localize = nil
end
56
+
57
# Copy every manifest entry whose media type starts with "image/" out of the
# EPUB archive into +images_dir+, keeping only the entry's base name.
#
# NOTE: this class defines extract_images_to_directory a second time further
# down with the same body; Ruby keeps the later definition.
#
# @param images_dir [String] directory to save extracted images
def extract_images_to_directory(images_dir)
  @parser.manifest.each do |_id, item|
    media_type = item[:media_type]
    next unless media_type && media_type.start_with?('image/')

    href = item[:href]
    # Hrefs are resolved against the package root unless they start with "/".
    archive_path =
      if href.start_with?('/')
        href[1..-1]
      else
        File.join(@parser.root_dir, href).gsub(/^\.\//, '')
      end

    entry = @parser.zip_file.find_entry(archive_path)
    next unless entry

    target = File.join(images_dir, File.basename(archive_path))
    File.open(target, 'wb') { |out| out.write(entry.get_input_stream.read) }
  end
end
80
+
81
+ # Convert EPUB to a single merged Markdown file
82
+ # @param localize_images [Boolean] whether to extract images to local directory
83
+ # @param output_filename [String] path to save merged Markdown file
84
+ # @return [String] path to the created Markdown file
85
# Convert the EPUB into a single merged Markdown file.
#
# Each section is emitted as "# <title or id>", a blank line and the converted
# body; sections are separated by a horizontal rule ("---").
#
# The directory of the output file is created recursively. The "images"
# sub-directory is only created when +localize_images+ is true.
#
# @param localize_images [Boolean] whether to extract images to local directory
# @param output_filename [String, nil] path to save merged Markdown file;
#   defaults to "<epub basename>_merged.md" next to the EPUB
# @return [String] path to the created Markdown file
def convert_to_single_markdown(localize_images: false, output_filename: nil)
  require 'fileutils'

  output_filename ||= File.dirname(@parser.epub_path) + "/#{@parser.epub_path.basename('.epub')}_merged.md"

  output_root = File.dirname(output_filename)
  FileUtils.mkdir_p(output_root)
  images_dir = File.join(output_root, 'images')

  # Stored so that HTML preprocessing can see the current conversion settings.
  @images_dir = images_dir
  @localize = localize_images

  if localize_images
    FileUtils.mkdir_p(images_dir)
    extract_images_to_directory(images_dir)
  end

  chapters = @parser.sections.map do |section|
    body = convert_html_to_markdown(section[:html_content])
    body = process_image_links(body, images_dir) if localize_images
    "# #{section[:title] || section[:id]}\n\n#{body}"
  end

  # Joining once avoids re-allocating the whole document for every section.
  File.write(output_filename, chapters.join("\n\n---\n\n"), encoding: 'UTF-8')
  output_filename
ensure
  # Always reset per-conversion state, even when a section fails to convert.
  @images_dir = nil
  @localize = nil
end
123
+
124
+ # Extract all image files from the EPUB manifest to the specified directory
125
+ # @param images_dir [String] directory to save extracted images
126
# Extract all image files listed in the EPUB manifest into +images_dir+.
#
# An entry counts as an image when its media type starts with "image/".
# Hrefs are resolved against the package root (or taken as archive-absolute
# when they start with "/") and then normalized, so references such as
# "../Images/cover.jpg" or "./a.png" resolve to the real archive entry.
# Entries without an href, and entries missing from the archive, are skipped.
# Files are written under their base name only.
#
# @param images_dir [String] directory to save extracted images
def extract_images_to_directory(images_dir)
  require 'pathname'

  @parser.manifest.each_value do |item|
    href = item[:href]
    next unless href && item[:media_type].to_s.start_with?('image/')

    raw_path = href.start_with?('/') ? href[1..-1] : File.join(@parser.root_dir, href)
    # cleanpath collapses "./" and "dir/../" segments without touching the disk.
    archive_path = Pathname.new(raw_path).cleanpath.to_s

    entry = @parser.zip_file.find_entry(archive_path)
    next unless entry

    File.binwrite(File.join(images_dir, File.basename(archive_path)), entry.get_input_stream.read)
  end
end
149
+
150
+ private
151
+
152
+ # Convert HTML content to Markdown format
153
+ # @param html_content [String] HTML content to convert
154
+ # @return [String] converted Markdown content
155
# Turn one section's HTML into Markdown.
#
# When image localization is active (@localize), the HTML is first passed
# through preprocess_html_images; if that yields nothing, the untouched HTML
# is converted instead. Runs of three or more newlines in the result are
# collapsed to a single blank line.
#
# @param html_content [String] HTML content to convert
# @return [String] converted Markdown content
def convert_html_to_markdown(html_content)
  localized_html = @localize ? preprocess_html_images(html_content) : nil
  source_html = localized_html || html_content

  # GitHub Flavored Markdown output; unknown tags are passed through.
  markdown_options = {
    github_flavored: true,
    unknown_tags: :pass_through,
    preserve_tags: ['img', 'video', 'audio']
  }

  markdown = ReverseMarkdown.convert(source_html, markdown_options)
  markdown.gsub(/\n{3,}/, "\n\n")
end
173
+
174
+ private
175
+
176
+ # Preprocess HTML content to handle image localization
177
+ # Downloads external images and extracts internal images to local directory
178
+ # Updates image sources in HTML to point to local copies
179
+ # @param html_content [String] HTML content to process
180
+ # @return [String] processed HTML content with updated image sources
181
# Preprocess HTML content to handle image localization.
#
# For every <img> with a src attribute:
# * sources starting with "." or "/" are left untouched;
# * http(s) sources are downloaded into @images_dir;
# * any other source is treated as a path inside the EPUB, resolved against
#   the parser's root directory and extracted into @images_dir.
# On success the src is rewritten to "./images/<filename>". Failures are
# reported on standard output and the original src is kept.
#
# Remote images are fetched with URI.open: since Ruby 3.0 Kernel#open no
# longer handles URLs, so a bare `open(url)` would fail for every download.
#
# @param html_content [String] HTML content to process
# @return [String] processed HTML document with updated image sources
def preprocess_html_images(html_content)
  require 'open-uri'

  doc = Nokogiri::HTML(html_content)

  doc.css('img').each do |img|
    src = img['src']
    next unless src

    # NOTE(review): this also skips "../Images/x.png"-style references, which
    # are relative EPUB paths rather than already-localized ones; confirm intent.
    next if src.start_with?('.', '/')

    if src.start_with?('http://', 'https://')
      begin
        filename = File.basename(URI.parse(src).path)
        filename = "image_#{Time.now.to_i}.jpg" if filename.empty? || filename == '/'

        image_path = File.join(@images_dir, filename)
        URI.open(src, 'rb') { |remote_file| File.binwrite(image_path, remote_file.read) }

        img['src'] = "./images/#{filename}"
      rescue => e
        puts "Failed to download image: #{src} - #{e.message}"
      end
    else
      begin
        filename = File.basename(src)
        image_path = File.join(@images_dir, filename)

        # Sources starting with "/" were skipped above, so the path is always
        # relative to the package root here.
        normalized_path = File.join(@parser.root_dir, src).sub(%r{\A\./}, '')

        entry = @parser.zip_file.find_entry(normalized_path)
        if entry
          File.binwrite(image_path, entry.get_input_stream.read)
          img['src'] = "./images/#{filename}"
        end
      rescue => e
        puts "Failed to extract internal image: #{src} - #{e.message}"
      end
    end
  end

  doc.to_html
end
245
+
246
# Rewrite Markdown image links so they point at local copies.
#
# For every "![alt](url)" in +markdown_content+:
# * urls starting with "." or "/" are left untouched;
# * http(s) urls are downloaded into +images_dir+;
# * any other url is treated as a path inside the EPUB, resolved against the
#   parser's root directory and extracted into +images_dir+.
# On success the link becomes "![alt](./images/<filename>)"; on failure (or
# when the archive has no such entry) the original link is kept and download
# or extraction errors are reported on standard output.
#
# The alt brackets are always emitted: an image with empty alt text must stay
# "![](...)"; dropping the brackets produced "!(...)", which is not an image.
# Remote images are fetched with URI.open because Kernel#open no longer
# handles URLs on Ruby 3.0 and later.
#
# @param markdown_content [String] Markdown text to scan
# @param images_dir [String] existing directory that receives the image files
# @return [String] Markdown text with rewritten image links
def process_image_links(markdown_content, images_dir)
  markdown_content.gsub(/!\[([^\]]*)\]\(([^)]+)\)/) do |match|
    alt_text = Regexp.last_match(1)
    url = Regexp.last_match(2)

    # Skip if it's already a local reference
    next match if url.start_with?('.', '/')

    if url.start_with?('http://', 'https://')
      begin
        require 'open-uri'
        filename = File.basename(URI.parse(url).path)
        filename = "image_#{Time.now.to_i}.jpg" if filename.empty? || filename == '/'

        image_path = File.join(images_dir, filename)
        URI.open(url, 'rb') { |remote_file| File.binwrite(image_path, remote_file.read) }

        "![#{alt_text}](./images/#{filename})"
      rescue => e
        puts "Failed to download image: #{url} - #{e.message}"
        match
      end
    else
      begin
        filename = File.basename(url)
        # Urls starting with "/" were skipped above, so the path is always
        # relative to the package root here.
        normalized_path = File.join(@parser.root_dir, url).sub(%r{\A\./}, '')

        entry = @parser.zip_file.find_entry(normalized_path)
        if entry
          File.binwrite(File.join(images_dir, filename), entry.get_input_stream.read)
          "![#{alt_text}](./images/#{filename})"
        else
          match
        end
      rescue => e
        puts "Failed to extract internal image: #{url} - #{e.message}"
        match
      end
    end
  end
end
310
+
311
+ # Sanitize filename by removing invalid characters
312
+ # @param filename [String] original filename
313
+ # @return [String] sanitized filename
314
# Make a string safe to use as a file name.
#
# Each of the characters < > : " / \ | ? * becomes an underscore, and every
# run of whitespace becomes a single underscore.
#
# @param filename [String] original filename
# @return [String] sanitized filename
def sanitize_filename(filename)
  filename.gsub(%r{[<>:"/\\|?*]|\s+}, '_')
end
318
+ end
319
+ end
@@ -0,0 +1,251 @@
1
+ require 'zip'
2
+ require 'nokogiri'
3
+ require 'pathname'
4
+
5
+ module Epub2md
6
+ # Parses EPUB files to extract metadata, structure, and content
7
+ class Parser
8
+ attr_reader :epub_path, :zip_file, :opf_path, :root_dir, :manifest, :spine, :metadata, :structure, :sections
9
+
10
+ # Initialize parser with EPUB file path
11
+ # @param epub_path [String] path to EPUB file
12
# Open the EPUB archive at +epub_path+ and set up empty result containers.
# Nothing is parsed yet; call #parse for that.
#
# @param epub_path [String] path to EPUB file
# @raise [RuntimeError] when no file exists at +epub_path+
def initialize(epub_path)
  @epub_path = Pathname.new(epub_path)
  raise "EPUB file does not exist: #{epub_path}" unless @epub_path.exist?

  @zip_file = Zip::File.open(@epub_path.to_s)

  # Hash-shaped results
  @manifest = {}
  @metadata = {}

  # List-shaped results
  @spine = []
  @structure = []
  @sections = []
end
23
+
24
+ # Parse the EPUB file and populate internal data structures
25
+ # @return [Parser] self for chaining
26
# Parse the EPUB and fill in the parser's data.
#
# Locates the OPF package document, derives the package root directory from
# it, parses the package, and finally loads the sections.
#
# @return [Parser] self for chaining
def parse
  @opf_path = get_opf_path
  @root_dir = determine_root_dir(@opf_path)

  package_doc = Nokogiri::XML(read_file(@opf_path))
  parse_package(package_doc)
  parse_sections

  self
end
40
+
41
+ private
42
+
43
+ # Get the path to the OPF (Open Packaging Format) file from container.xml
44
+ # @return [String] path to OPF file
45
# Get the path to the OPF (Open Packaging Format) file from container.xml.
#
# The <rootfile> element is matched by local name, so the lookup does not
# depend on the container document declaring a default namespace.
#
# @return [String] path to OPF file, as given by the rootfile's full-path
# @raise [RuntimeError] when container.xml has no rootfile with a full-path;
#   failing here gives a clear message instead of a later TypeError on nil
def get_opf_path
  container_doc = Nokogiri::XML(read_file('META-INF/container.xml'))

  rootfile = container_doc.at_xpath('//*[local-name()="rootfile"][@full-path]')
  raise 'Invalid EPUB: META-INF/container.xml has no rootfile with a full-path' unless rootfile

  rootfile['full-path']
end
52
+
53
+ # Determine the root directory based on OPF path
54
+ # @param opf_path [String] path to OPF file
55
+ # @return [String] root directory path
56
# Directory part of the OPF path; other archive paths are resolved against it.
# An OPF at the top of the archive yields ".".
#
# @param opf_path [String] path to OPF file
# @return [String] root directory path
def determine_root_dir(opf_path)
  File.dirname(opf_path)
end
59
+
60
+ # Parse the package document (OPF file) to extract metadata, manifest, spine, and TOC
61
+ # @param doc [Nokogiri::XML::Document] parsed OPF XML document
62
# Walk the package document (OPF file) and hand each part to its parser:
# metadata (when present), manifest items, spine itemrefs, then the TOC.
#
# @param doc [Nokogiri::XML::Document] parsed OPF XML document
def parse_package(doc)
  opf_ns = { 'opf' => 'http://www.idpf.org/2007/opf' }

  metadata_node = doc.at_xpath('//opf:metadata', opf_ns)
  parse_metadata(metadata_node) if metadata_node

  parse_manifest(doc.xpath('//opf:manifest/opf:item', opf_ns))
  parse_spine(doc.xpath('//opf:spine/opf:itemref', opf_ns))

  # The TOC lookup relies on the manifest, so it runs last.
  parse_toc(doc)
end
78
+
79
+ # Parse metadata from the OPF document
80
+ # @param node [Nokogiri::XML::Element] metadata node
81
# Fill @metadata from the Dublin Core elements of the metadata node.
# Each key is set to the element's text, or nil when the element is absent.
#
# @param node [Nokogiri::XML::Element] metadata node
def parse_metadata(node)
  fields = {
    title: 'dc:title',
    author: 'dc:creator',
    description: 'dc:description',
    language: 'dc:language',
    publisher: 'dc:publisher',
    rights: 'dc:rights'
  }

  fields.each do |key, xpath|
    @metadata[key] = get_text_content(node, xpath)
  end
end
90
+
91
+ # Extract text content from a specific XPath within the metadata node
92
+ # @param node [Nokogiri::XML::Element] parent node
93
+ # @param xpath [String] xpath to the desired element
94
+ # @return [String, nil] text content or nil if not found
95
# Look up +xpath+ under +node+ (with the "dc" prefix bound to the Dublin Core
# namespace) and return the stripped text of the first match.
#
# @param node [Nokogiri::XML::Element] parent node
# @param xpath [String] xpath to the desired element
# @return [String, nil] text content; nil if not found or if the lookup raises
def get_text_content(node, xpath)
  dublin_core = { 'dc' => 'http://purl.org/dc/elements/1.1/' }
  node.at_xpath(xpath, dublin_core)&.text&.strip
rescue
  nil
end
101
+
102
+ # Parse manifest items from the OPF document
103
+ # @param nodes [Array<Nokogiri::XML::Element>] manifest item nodes
104
# Record every manifest item in @manifest, keyed by its id, keeping the
# item's href and media type.
#
# @param nodes [Array<Nokogiri::XML::Element>] manifest item nodes
def parse_manifest(nodes)
  nodes.each do |item|
    @manifest[item['id']] = { href: item['href'], media_type: item['media-type'] }
  end
end
116
+
117
+ # Parse spine items from the OPF document
118
+ # @param nodes [Array<Nokogiri::XML::Element>] spine itemref nodes
119
# Append one entry per spine itemref to @spine, in document order.
# An itemref counts as linear unless its "linear" attribute is exactly "no".
#
# @param nodes [Array<Nokogiri::XML::Element>] spine itemref nodes
def parse_spine(nodes)
  nodes.each_with_index do |itemref, position|
    entry = {
      idref: itemref['idref'],
      linear: itemref['linear'] != 'no',
      position: position
    }
    @spine << entry
  end
end
131
+
132
+ # Parse the Table of Contents from either NCX or navigation document
133
+ # @param doc [Nokogiri::XML::Document] parsed OPF XML document
134
# Load the table of contents.
#
# Prefers the NCX document (manifest media type application/x-dtbncx+xml).
# Without one, falls back to the first XHTML manifest item whose href
# contains "nav". Does nothing when neither is found.
#
# @param doc [Nokogiri::XML::Document] parsed OPF XML document (not consulted
#   here; the lookup works from @manifest)
def parse_toc(doc)
  # Archive path for a manifest href: "/x" is archive-absolute, anything else
  # is relative to the package root (a leading "./" is dropped).
  to_archive_path = lambda do |href|
    href.start_with?('/') ? href[1..-1] : File.join(@root_dir, href).gsub(/^\.\//, '')
  end

  _, ncx = @manifest.find { |_, item| item[:media_type] == 'application/x-dtbncx+xml' }
  if ncx
    ncx_source = read_file(to_archive_path.call(ncx[:href]))
    return parse_ncx_toc(Nokogiri::XML(ncx_source))
  end

  _, nav = @manifest.find do |_, item|
    item[:media_type] == 'application/xhtml+xml' && item[:href].include?('nav')
  end
  return unless nav

  nav_source = read_file(to_archive_path.call(nav[:href]))
  parse_nav_toc(Nokogiri::XML(nav_source))
end
157
+
158
# Build @structure from an NCX document's <navMap>.
#
# Only the navMap's direct navPoint children are used as top-level entries;
# nested navPoints are collected recursively by parse_nav_points. Selecting
# every descendant here would list nested entries twice: once at the top
# level and once under their parent.
#
# @param doc [Nokogiri::XML::Document] parsed NCX document
def parse_ncx_toc(doc)
  ncx_ns = { 'ncx' => 'http://www.daisy.org/z3986/2005/ncx/' }

  nav_map = doc.at_xpath('//ncx:navMap', ncx_ns)
  return unless nav_map

  @structure = parse_nav_points(nav_map.xpath('./ncx:navPoint', ncx_ns))
end
164
+
165
# Build @structure from an EPUB 3 navigation document.
#
# Elements are matched by local name because navigation documents are XHTML
# and normally declare the XHTML default namespace, which unprefixed XPath
# names such as "nav" or "li" never match. Only the top-level <li> entries of
# the toc <nav>'s own <ol> are selected; nested lists are handled by
# parse_nav_list.
#
# @param doc [Nokogiri::XML::Document] parsed navigation document
def parse_nav_toc(doc)
  top_level_items = doc.xpath(
    '//*[local-name()="nav"][@epub:type="toc"]/*[local-name()="ol"]/*[local-name()="li"]',
    { 'epub' => 'http://www.idpf.org/2007/ops' }
  )
  @structure = parse_nav_list(top_level_items)
end
170
+
171
# Convert NCX navPoint elements into nested structure hashes.
#
# Label and content are looked up as direct children of each navPoint so a
# nested navPoint's values can never be picked up for its parent. The play
# order is read from the "playOrder" attribute (the NCX spelling; XML
# attribute names are case-sensitive), with the lower-case spelling kept as a
# fallback.
#
# @param nav_points [Enumerable<Nokogiri::XML::Element>] navPoint elements
# @return [Array<Hash>] hashes with :name, :path, :play_order and :children
def parse_nav_points(nav_points)
  ncx_ns = { 'ncx' => 'http://www.daisy.org/z3986/2005/ncx/' }

  nav_points.map do |point|
    label = point.at_xpath('./ncx:navLabel/ncx:text', ncx_ns)
    content = point.at_xpath('./ncx:content', ncx_ns)

    {
      name: label&.text&.strip,
      path: content && content['src'],
      play_order: point['playOrder'] || point['playorder'],
      # Mapping an empty child set yields [], so leaves get children: [].
      children: parse_nav_points(point.xpath('./ncx:navPoint', ncx_ns))
    }
  end
end
189
+
190
# Convert the <li> entries of an EPUB 3 navigation list into nested
# structure hashes.
#
# Children are matched by local name so the lookup works whether or not the
# document declares the XHTML namespace (an unprefixed "./li" never matches
# namespaced elements, which left :children permanently empty). The label and
# the nested list are taken from the <li>'s direct children only, so a nested
# entry's link is never mistaken for its parent's. Entries labelled with a
# <span> instead of a link get a name but no path.
#
# @param nav_elements [Enumerable<Nokogiri::XML::Element>] <li> elements
# @return [Array<Hash>] hashes with :name, :path and :children
def parse_nav_list(nav_elements)
  nav_elements.map do |element|
    label = element.at_xpath('./*[local-name()="a" or local-name()="span"]')
    sub_list = element.at_xpath('./*[local-name()="ol"]')

    {
      name: label&.text&.strip,
      path: label && label['href'],
      children: sub_list ? parse_nav_list(sub_list.xpath('./*[local-name()="li"]')) : []
    }
  end
end
209
+
210
+ # Parse sections (chapters) from the spine and manifest
211
+ # Adds section data to the @sections array
212
# Load every XHTML document referenced by the spine, in spine order, and
# append it to @sections with its id, archive path, raw HTML and title.
# Spine entries without a manifest item, or whose media type is not
# application/xhtml+xml, are skipped.
def parse_sections
  @spine.each do |entry|
    idref = entry[:idref]
    item = @manifest[idref]
    next if item.nil? || item[:media_type] != 'application/xhtml+xml'

    href = item[:href]
    # "/x" is archive-absolute; anything else is relative to the package root.
    archive_path =
      if href.start_with?('/')
        href[1..-1]
      else
        File.join(@root_dir, href).gsub(/^\.\//, '')
      end

    html = read_file(archive_path)
    @sections << {
      id: idref,
      path: archive_path,
      html_content: html,
      title: extract_title_from_html(html)
    }
  end
end
229
+
230
# Stripped text of the document's first <title> element.
#
# @param html_content [String] HTML source of a section
# @return [String, nil] the title; nil when there is no <title> or when
#   parsing raises
def extract_title_from_html(html_content)
  Nokogiri::HTML(html_content).at_css('title')&.text&.strip
rescue
  nil
end
237
+
238
+ # Read a file from the EPUB archive
239
+ # @param path [String] path to file within EPUB
240
+ # @return [String] file content
241
# Read one entry from the EPUB archive.
#
# A single leading "/" is dropped before the lookup, so "/a/b" and "a/b" name
# the same entry. The entry name must match exactly.
#
# @param path [String] path to file within EPUB
# @return [String] file content
# @raise [RuntimeError] when the archive has no such entry
def read_file(path)
  entry_name = path.delete_prefix('/')
  entry = @zip_file.find_entry(entry_name)
  raise "File not found in EPUB: #{path}" if entry.nil?

  entry.get_input_stream.read
end
250
+ end
251
+ end
@@ -0,0 +1,3 @@
1
module Epub2md
  # Version string of the gem; frozen so it cannot be mutated in place.
  VERSION = '0.1.0'.freeze
end
data/lib/epub2md.rb ADDED
@@ -0,0 +1,8 @@
1
+ require_relative 'epub2md/version'
2
+ require_relative 'epub2md/parser'
3
+ require_relative 'epub2md/converter'
4
+
5
# Top-level namespace of the epub2md gem.
module Epub2md
  # Error class declared by the gem; a direct subclass of StandardError.
  class Error < StandardError
  end
end
metadata ADDED
@@ -0,0 +1,152 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: epub2md
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - lucas.qin
8
+ bindir: exe
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: rubyzip
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: '2.3'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: '2.3'
26
+ - !ruby/object:Gem::Dependency
27
+ name: nokogiri
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: '1.13'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '1.13'
40
+ - !ruby/object:Gem::Dependency
41
+ name: reverse_markdown
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '2.1'
47
+ type: :runtime
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '2.1'
54
+ - !ruby/object:Gem::Dependency
55
+ name: thor
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '1.2'
61
+ type: :runtime
62
+ prerelease: false
63
+ version_requirements: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '1.2'
68
+ - !ruby/object:Gem::Dependency
69
+ name: httparty
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: '0.21'
75
+ type: :runtime
76
+ prerelease: false
77
+ version_requirements: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '0.21'
82
+ - !ruby/object:Gem::Dependency
83
+ name: rspec
84
+ requirement: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - "~>"
87
+ - !ruby/object:Gem::Version
88
+ version: '3.12'
89
+ type: :development
90
+ prerelease: false
91
+ version_requirements: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - "~>"
94
+ - !ruby/object:Gem::Version
95
+ version: '3.12'
96
+ - !ruby/object:Gem::Dependency
97
+ name: pry
98
+ requirement: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - "~>"
101
+ - !ruby/object:Gem::Version
102
+ version: '0.14'
103
+ type: :development
104
+ prerelease: false
105
+ version_requirements: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - "~>"
108
+ - !ruby/object:Gem::Version
109
+ version: '0.14'
110
+ description: Convert EPUB files to Markdown with support for images, chapters, and
111
+ formatting
112
+ email:
113
+ - qsc1956826@gmail.com
114
+ executables: []
115
+ extensions: []
116
+ extra_rdoc_files: []
117
+ files:
118
+ - Gemfile
119
+ - Gemfile.lock
120
+ - README.md
121
+ - Rakefile
122
+ - bin/epub2md
123
+ - epub2md.gemspec
124
+ - lib/epub2md.rb
125
+ - lib/epub2md/converter.rb
126
+ - lib/epub2md/parser.rb
127
+ - lib/epub2md/version.rb
128
+ homepage: https://github.com/GengCen-Qin/epub2md
129
+ licenses:
130
+ - MIT
131
+ metadata:
132
+ homepage_uri: https://github.com/GengCen-Qin/epub2md
133
+ source_code_uri: https://github.com/GengCen-Qin/epub2md
134
+ changelog_uri: https://github.com/GengCen-Qin/epub2md/blob/main/CHANGELOG.md
135
+ rdoc_options: []
136
+ require_paths:
137
+ - lib
138
+ required_ruby_version: !ruby/object:Gem::Requirement
139
+ requirements:
140
+ - - ">="
141
+ - !ruby/object:Gem::Version
142
+ version: 2.6.0
143
+ required_rubygems_version: !ruby/object:Gem::Requirement
144
+ requirements:
145
+ - - ">="
146
+ - !ruby/object:Gem::Version
147
+ version: '0'
148
+ requirements: []
149
+ rubygems_version: 3.6.9
150
+ specification_version: 4
151
+ summary: A Ruby gem to convert EPUB files to Markdown format
152
+ test_files: []