RubyGems - epub2md - Versions diffs - 0.1.0 - Mend

epub2md 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: f4a66cfbc763f53d06686ccce9731cdea76b2d5d7221c84868d8aba1adf2ac2a
+  data.tar.gz: 9ecff2c2013f7a719d5314074b939b96864efbbef1b04ab5b11a6a65f533af4c
+SHA512:
+  metadata.gz: a442a7a1abdce5712934be4f6a5b22018eac0f1502516d9b834e5e4e12c815059276425a4ac4baa2693d7e30e7ced8a7e9eff21ab47458770c7d38b2de9a3556
+  data.tar.gz: d73b7bd071dbe7469a6292c1076f51a24f1c6924cf46e076bafe654be3612f7d870347446806717a6059695bb27b08ca1f118f8e2cd7ca0bd52b2cbc52589848

data/Gemfile ADDED Viewed

@@ -0,0 +1,14 @@
+# frozen_string_literal: true
+source 'https://rubygems.org'
+gem 'rubyzip', '~> 2.3'
+gem 'nokogiri', '~> 1.13'
+gem 'reverse_markdown', '~> 2.1'
+gem 'thor', '~> 1.2'
+gem 'httparty', '~> 0.21'
+group :development, :test do
+  gem 'rspec', '~> 3.12'
+  gem 'pry', '~> 0.14'
+end

data/Gemfile.lock ADDED Viewed

@@ -0,0 +1,78 @@
+GEM
+  remote: https://rubygems.org/
+  specs:
+    bigdecimal (4.0.1)
+    coderay (1.1.3)
+    csv (3.3.5)
+    diff-lcs (1.6.2)
+    httparty (0.24.2)
+      csv
+      mini_mime (>= 1.0.0)
+      multi_xml (>= 0.5.2)
+    io-console (0.8.2)
+    method_source (1.1.0)
+    mini_mime (1.1.5)
+    multi_xml (0.8.1)
+      bigdecimal (>= 3.1, < 5)
+    nokogiri (1.19.0-aarch64-linux-gnu)
+      racc (~> 1.4)
+    nokogiri (1.19.0-aarch64-linux-musl)
+      racc (~> 1.4)
+    nokogiri (1.19.0-arm-linux-gnu)
+      racc (~> 1.4)
+    nokogiri (1.19.0-arm-linux-musl)
+      racc (~> 1.4)
+    nokogiri (1.19.0-arm64-darwin)
+      racc (~> 1.4)
+    nokogiri (1.19.0-x86_64-darwin)
+      racc (~> 1.4)
+    nokogiri (1.19.0-x86_64-linux-gnu)
+      racc (~> 1.4)
+    nokogiri (1.19.0-x86_64-linux-musl)
+      racc (~> 1.4)
+    pry (0.16.0)
+      coderay (~> 1.1)
+      method_source (~> 1.0)
+      reline (>= 0.6.0)
+    racc (1.8.1)
+    reline (0.6.3)
+      io-console (~> 0.5)
+    reverse_markdown (2.1.1)
+      nokogiri
+    rspec (3.13.2)
+      rspec-core (~> 3.13.0)
+      rspec-expectations (~> 3.13.0)
+      rspec-mocks (~> 3.13.0)
+    rspec-core (3.13.6)
+      rspec-support (~> 3.13.0)
+    rspec-expectations (3.13.5)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.13.0)
+    rspec-mocks (3.13.7)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.13.0)
+    rspec-support (3.13.7)
+    rubyzip (2.4.1)
+    thor (1.5.0)
+PLATFORMS
+  aarch64-linux-gnu
+  aarch64-linux-musl
+  arm-linux-gnu
+  arm-linux-musl
+  arm64-darwin
+  x86_64-darwin
+  x86_64-linux-gnu
+  x86_64-linux-musl
+DEPENDENCIES
+  httparty (~> 0.21)
+  nokogiri (~> 1.13)
+  pry (~> 0.14)
+  reverse_markdown (~> 2.1)
+  rspec (~> 3.12)
+  rubyzip (~> 2.3)
+  thor (~> 1.2)
+BUNDLED WITH
+   2.7.2

data/README.md ADDED Viewed

@@ -0,0 +1,96 @@
+# Ruby EPUB2MD
+A Ruby implementation of the epub2md tool for converting EPUB files to Markdown format.
+## Features
+- Convert EPUB files to multiple Markdown files
+- Convert EPUB files to a single merged Markdown file
+- Extract EPUB metadata and structure information
+- Localize images (extract images from EPUB to local directory when using `--localize` option)
+- Command-line interface for easy usage
+- Programmatic API for integration into other tools
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'epub2md'
+```
+And then execute:
+```bash
+bundle install
+```
+Or install it yourself as:
+```bash
+gem install epub2md
+```
+## Usage
+### Command Line Interface
+```bash
+# Convert EPUB to multiple Markdown files (text only, no images extracted)
+bundle exec ruby bin/epub2md convert path/to/book.epub
+# Convert EPUB to a single Markdown file (text only, no images extracted)
+bundle exec ruby bin/epub2md merge path/to/book.epub
+# Show EPUB information (human-readable format)
+bundle exec ruby bin/epub2md info path/to/book.epub
+# Show EPUB information (JSON format)
+bundle exec ruby bin/epub2md info path/to/book.epub --json
+# Show EPUB structure
+bundle exec ruby bin/epub2md structure path/to/book.epub
+# Show EPUB sections
+bundle exec ruby bin/epub2md sections path/to/book.epub
+# Localize images during conversion (extract images from EPUB to local directory)
+# This creates an 'images' directory with all images from the EPUB
+bundle exec ruby bin/epub2md convert path/to/book.epub --localize
+# Specify output directory
+bundle exec ruby bin/epub2md convert path/to/book.epub --output /custom/output/dir
+```
+**Note**: Image extraction only occurs when using the `--localize` option. Without this option, only text content is converted to Markdown, and images remain embedded in the EPUB file but are not extracted to the output directory.
+### Programmatic Usage
+```ruby
+require 'epub2md'
+# Parse an EPUB file
+parser = Epub2md::Parser.new('path/to/book.epub')
+parser.parse
+# Convert to multiple markdown files
+converter = Epub2md::Converter.new(parser)
+converter.convert_to_markdown(output_dir: './output')
+# Or convert to a single markdown file
+converter.convert_to_single_markdown(output_filename: './output/merged_book.md')
+```
+## Development
+After checking out the repo, run `bundle install` to install dependencies. Then, run `bundle exec rspec` to run the tests.
+To install this gem onto your local machine, run `bundle exec rake install`.
+## Contributing
+Bug reports and pull requests are welcome on GitHub at https://github.com/GengCen-Qin/epub2md.
+## License
+The gem is available as open source under the terms of the MIT License.

data/Rakefile ADDED Viewed

@@ -0,0 +1,27 @@
+require 'bundler/gem_tasks'
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new(:spec)
+task :default => :spec
+namespace :epub2md do
+  desc "Install dependencies"
+  task :install do
+    sh "bundle install"
+  end
+  desc "Run tests"
+  task :test => :spec
+  desc "Build gem"
+  task :build do
+    sh "gem build epub2md.gemspec"
+  end
+  desc "Install gem locally"
+  task :install_gem => :build do
+    gem_file = Dir.glob("*.gem").first
+    sh "gem install #{gem_file}" if gem_file
+  end
+end

data/bin/epub2md ADDED Viewed

@@ -0,0 +1,145 @@
+#!/usr/bin/env ruby
+require 'thor'
+require_relative '../lib/epub2md'
+module Epub2md
+  # Command-line interface for the EPUB to Markdown converter
+  class CLI < Thor
+    class_option :verbose, type: :boolean, default: false, desc: 'Enable verbose output'
+    # Convert EPUB file to multiple Markdown files
+    desc 'convert PATH', 'Convert EPUB file to multiple Markdown files'
+    option :output, aliases: '-o', type: :string, desc: 'Output directory'
+    option :localize, type: :boolean, default: false, desc: 'Extract images to local directory'
+    def convert(path)
+      validate_epub_path(path)
+      setup_verbose_logging
+      parser = Parser.new(path)
+      parser.parse
+      converter = Converter.new(parser)
+      output_dir = options[:output] || File.dirname(path) + "/#{File.basename(path, '.epub')}_markdown"
+      puts "Converting #{path} to Markdown files..." if options[:verbose]
+      files = converter.convert_to_markdown(
+        localize_images: options[:localize],
+        output_dir: output_dir
+      )
+      puts "Conversion complete! Created #{files.length} Markdown files in: #{output_dir}"
+      files.each { |file| puts "  - #{file}" } if options[:verbose]
+    end
+    desc 'merge PATH', 'Convert EPUB file to a single Markdown file'
+    option :output, aliases: '-o', type: :string, desc: 'Output filename'
+    option :localize, type: :boolean, default: false, desc: 'Download and localize remote images'
+    def merge(path)
+      validate_epub_path(path)
+      setup_verbose_logging
+      parser = Parser.new(path)
+      parser.parse
+      converter = Converter.new(parser)
+      output_file = options[:output] || File.dirname(path) + "/#{File.basename(path, '.epub')}_merged.md"
+      puts "Converting #{path} to single Markdown file..." if options[:verbose]
+      result = converter.convert_to_single_markdown(
+        localize_images: options[:localize],
+        output_filename: output_file
+      )
+      puts "Merged conversion complete! Created: #{result}"
+    end
+    desc 'info PATH', 'Show basic information about the EPUB file'
+    option :json, type: :boolean, default: false, desc: 'Output information in JSON format'
+    def info(path)
+      validate_epub_path(path)
+      parser = Parser.new(path)
+      parser.parse
+      if options[:json]
+        # Output in JSON format
+        info_data = {
+          file: File.basename(path),
+          metadata: parser.metadata,
+          stats: {
+            sections_count: parser.sections.length,
+            manifest_items_count: parser.manifest.length
+          }
+        }
+        require 'json'
+        puts JSON.pretty_generate(info_data)
+      else
+        # Output in human-readable format
+        puts "EPUB Information for: #{File.basename(path)}"
+        puts "Title: #{parser.metadata[:title] || 'Unknown'}"
+        puts "Author: #{parser.metadata[:author] || 'Unknown'}"
+        puts "Language: #{parser.metadata[:language] || 'Unknown'}"
+        puts "Publisher: #{parser.metadata[:publisher] || 'Unknown'}"
+        puts "Description: #{parser.metadata[:description] || 'None'}"
+        puts "Number of sections: #{parser.sections.length}"
+        puts "Number of manifest items: #{parser.manifest.length}"
+      end
+    end
+    desc 'structure PATH', 'Show the structure of the EPUB file'
+    def structure(path)
+      validate_epub_path(path)
+      parser = Parser.new(path)
+      parser.parse
+      puts "EPUB Structure for: #{File.basename(path)}"
+      print_structure(parser.structure)
+    end
+    desc 'sections PATH', 'List all sections/chapters in the EPUB file'
+    def sections(path)
+      validate_epub_path(path)
+      parser = Parser.new(path)
+      parser.parse
+      puts "Sections in: #{File.basename(path)}"
+      parser.sections.each_with_index do |section, index|
+        title = section[:title] || 'Untitled'
+        puts "#{index + 1}. #{title} (ID: #{section[:id]})"
+      end
+    end
+    private
+    def validate_epub_path(path)
+      unless File.exist?(path)
+        puts "Error: File does not exist: #{path}"
+        exit(1)
+      end
+      unless path.end_with?('.epub')
+        puts "Error: File is not an EPUB: #{path}"
+        exit(1)
+      end
+    end
+    def setup_verbose_logging
+      $VERBOSE = true if options[:verbose]
+    end
+    def print_structure(items, level = 0)
+      indent = "  " * level
+      items.each do |item|
+        puts "#{indent}- #{item[:name] || 'Unnamed'} (#{item[:path] || 'No path'})"
+        print_structure(item[:children], level + 1) if item[:children]
+      end
+    end
+  end
+end
+# Start the CLI only when this file is the program being run, not when it is
+# loaded by another script.
+Epub2md::CLI.start(ARGV) if $PROGRAM_NAME == __FILE__

data/epub2md.gemspec ADDED Viewed

@@ -0,0 +1,39 @@
+# frozen_string_literal: true
+require_relative 'lib/epub2md/version'
+# Gem specification for epub2md: identity, packaged files and dependencies.
+Gem::Specification.new do |spec|
+  spec.name          = 'epub2md'
+  spec.version       = Epub2md::VERSION
+  spec.authors       = ['lucas.qin']
+  spec.email         = ['qsc1956826@gmail.com']
+  spec.summary       = 'A Ruby gem to convert EPUB files to Markdown format'
+  spec.description   = 'Convert EPUB files to Markdown with support for images, chapters, and formatting'
+  spec.homepage      = 'https://github.com/GengCen-Qin/epub2md'
+  spec.license       = 'MIT'
+  spec.required_ruby_version = Gem::Requirement.new('>= 2.6.0')
+  spec.metadata['homepage_uri'] = spec.homepage
+  spec.metadata['source_code_uri'] = spec.homepage
+  spec.metadata['changelog_uri'] = "#{spec.homepage}/blob/main/CHANGELOG.md"
+  # Specify which files should be added to the gem when it is released.
+  # The list is every git-tracked file except those under test/, spec/ or features/.
+  spec.files         = Dir.chdir(File.expand_path('..', __FILE__)) do
+    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+  end
+  # NOTE(review): executables are collected from exe/, but the only CLI script
+  # in this package is bin/epub2md, so this list appears to come out empty and
+  # `gem install epub2md` would not provide an `epub2md` command — confirm.
+  # Pointing bindir at bin/ would also require the script to start the CLI
+  # without its `__FILE__ == $0` guard, since RubyGems loads it via a wrapper.
+  spec.bindir        = 'exe'
+  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+  spec.require_paths = ['lib']
+  # Runtime dependencies
+  spec.add_dependency 'rubyzip', '~> 2.3'
+  spec.add_dependency 'nokogiri', '~> 1.13'
+  spec.add_dependency 'reverse_markdown', '~> 2.1'
+  spec.add_dependency 'thor', '~> 1.2'
+  spec.add_dependency 'httparty', '~> 0.21'
+  # Development dependencies
+  spec.add_development_dependency 'rspec', '~> 3.12'
+  spec.add_development_dependency 'pry', '~> 0.14'
+end

data/lib/epub2md/converter.rb ADDED Viewed

@@ -0,0 +1,319 @@
+require 'fileutils'
+require 'reverse_markdown'
+module Epub2md
+  # Converts EPUB content to Markdown format
+  class Converter
+    # Initialize converter with a parser instance.
+    # The converter reads sections, manifest, root_dir, zip_file and epub_path
+    # from this parser, so `parse` should have been called on it beforehand.
+    # @param parser [Parser] EPUB parser instance
+    def initialize(parser)
+      @parser = parser
+    end
+    # Convert EPUB to multiple Markdown files
+    # @param localize_images [Boolean] whether to extract images to local directory
+    # @param output_dir [String] directory to save Markdown files
+    # @return [Array<String>] list of created Markdown file paths
+    def convert_to_markdown(localize_images: false, output_dir: nil)
+      output_dir ||= File.dirname(@parser.epub_path) + "/#{@parser.epub_path.basename('.epub')}_markdown"
+      Dir.mkdir(output_dir) unless Dir.exist?(output_dir)
+      # Create images directory if needed
+      images_dir = File.join(output_dir, 'images')
+      Dir.mkdir(images_dir) unless Dir.exist?(images_dir)
+      # Store the images directory for use in preprocessing
+      @images_dir = images_dir
+      @localize = localize_images
+      # Extract all images from the EPUB to the images directory
+      extract_images_to_directory(images_dir) if localize_images
+      # Convert each section to markdown
+      markdown_files = []
+      @parser.sections.each_with_index do |section, index|
+        # Format the filename with leading zeros based on total sections
+        filename = sprintf("%0#{Math.log10(@parser.sections.length).to_i + 1}d", index + 1)
+        filename += "-#{sanitize_filename(section[:title] || section[:id])}.md"
+        markdown_content = convert_html_to_markdown(section[:html_content])
+        # Process image links if localization is enabled
+        if localize_images
+          markdown_content = process_image_links(markdown_content, images_dir)
+        end
+        file_path = File.join(output_dir, filename)
+        File.write(file_path, markdown_content, encoding: 'UTF-8')
+        markdown_files << file_path
+      end
+      # Clear the stored variables
+      @images_dir = nil
+      @localize = nil
+      markdown_files
+    end
+    # NOTE(review): this method is defined a second time, with an identical
+    # body, further down in this class (after convert_to_single_markdown).
+    # Ruby keeps the later definition, so this copy is dead code and can be
+    # deleted.
+    # Copies every manifest item whose media type starts with "image/" from the
+    # EPUB archive into images_dir, named by the basename of its archive path.
+    # @param images_dir [String] directory to save extracted images
+    def extract_images_to_directory(images_dir)
+      # Extract all image files from the EPUB manifest to the images directory
+      @parser.manifest.each do |id, item|
+        href = item[:href]
+        media_type = item[:media_type]
+        # Check if the item is an image
+        if media_type && media_type.start_with?('image/')
+          # Determine the full path in the EPUB
+          normalized_path = href.start_with?('/') ? href[1..-1] : File.join(@parser.root_dir, href).gsub(/^\.\//, '')
+          # Find the image in the EPUB archive and extract it
+          entry = @parser.zip_file.find_entry(normalized_path)
+          if entry
+            # Extract the image from the EPUB to the local images directory
+            image_path = File.join(images_dir, File.basename(normalized_path))
+            File.open(image_path, 'wb') do |f|
+              f.write(entry.get_input_stream.read)
+            end
+          end
+        end
+      end
+    end
+    # Convert EPUB to a single merged Markdown file
+    # @param localize_images [Boolean] whether to extract images to local directory
+    # @param output_filename [String] path to save merged Markdown file
+    # @return [String] path to the created Markdown file
+    def convert_to_single_markdown(localize_images: false, output_filename: nil)
+      output_filename ||= File.dirname(@parser.epub_path) + "/#{@parser.epub_path.basename('.epub')}_merged.md"
+      # Create images directory if needed
+      images_dir = File.join(File.dirname(output_filename), 'images')
+      Dir.mkdir(images_dir) unless Dir.exist?(images_dir)
+      # Store the images directory for use in preprocessing
+      @images_dir = images_dir
+      @localize = localize_images
+      # Extract all images from the EPUB to the images directory
+      extract_images_to_directory(images_dir) if localize_images
+      markdown_content = ""
+      @parser.sections.each_with_index do |section, index|
+        # Add a header for each section
+        title = section[:title] || section[:id]
+        markdown_content += "# #{title}\n\n"
+        content = convert_html_to_markdown(section[:html_content])
+        # Process image links if localization is enabled
+        if localize_images
+          content = process_image_links(content, images_dir)
+        end
+        markdown_content += content
+        markdown_content += "\n\n---\n\n" unless index == @parser.sections.length - 1
+      end
+      # Clear the stored variables
+      @images_dir = nil
+      @localize = nil
+      File.write(output_filename, markdown_content, encoding: 'UTF-8')
+      output_filename
+    end
+    # Extract all image files from the EPUB manifest to the specified directory
+    # @param images_dir [String] directory to save extracted images
+    def extract_images_to_directory(images_dir)
+      # Extract all image files from the EPUB manifest to the images directory
+      @parser.manifest.each do |id, item|
+        href = item[:href]
+        media_type = item[:media_type]
+        # Check if the item is an image
+        if media_type && media_type.start_with?('image/')
+          # Determine the full path in the EPUB
+          normalized_path = href.start_with?('/') ? href[1..-1] : File.join(@parser.root_dir, href).gsub(/^\.\//, '')
+          # Find the image in the EPUB archive and extract it
+          entry = @parser.zip_file.find_entry(normalized_path)
+          if entry
+            # Extract the image from the EPUB to the local images directory
+            image_path = File.join(images_dir, File.basename(normalized_path))
+            File.open(image_path, 'wb') do |f|
+              f.write(entry.get_input_stream.read)
+            end
+          end
+        end
+      end
+    end
+    private
+    # Convert HTML content to Markdown format
+    # @param html_content [String] HTML content to convert
+    # @return [String] converted Markdown content
+    def convert_html_to_markdown(html_content)
+      # First, process any image tags in the HTML to handle localization
+      processed_html = preprocess_html_images(html_content) if @localize
+      # Use ReverseMarkdown to convert HTML to Markdown
+      # Options: GitHub Flavored Markdown compatible
+      options = {
+        github_flavored: true,
+        unknown_tags: :pass_through,
+        # Preserve certain tags as HTML
+        preserve_tags: ['img', 'video', 'audio']
+      }
+      converted = ReverseMarkdown.convert(processed_html || html_content, options)
+      # Clean up extra whitespace
+      converted.gsub(/\n{3,}/, "\n\n")
+    end
+    private
+    # Preprocess HTML content to handle image localization
+    # Downloads external images and extracts internal images to local directory
+    # Updates image sources in HTML to point to local copies
+    # @param html_content [String] HTML content to process
+    # @return [String] processed HTML content with updated image sources
+    def preprocess_html_images(html_content)
+      # Parse the HTML to find image tags
+      doc = Nokogiri::HTML(html_content)
+      # Find all image tags
+      doc.css('img').each do |img|
+        src = img['src']
+        next unless src
+        # Skip if it's already a local reference
+        next if src.start_with?('.') || src.start_with?('/')
+        if src.start_with?('http://', 'https://')
+          # Handle external images
+          begin
+            require 'open-uri'
+            uri = URI.parse(src)
+            filename = File.basename(uri.path)
+            filename = "image_#{Time.now.to_i}.jpg" if filename.empty? || filename == '/'
+            image_path = File.join(@images_dir, filename)
+            # Download the image
+            open(src, 'rb') do |remote_file|
+              File.open(image_path, 'wb') do |local_file|
+                local_file.write(remote_file.read)
+              end
+            end
+            # Update the image source to point to the local copy
+            img['src'] = "./images/#{filename}"
+          rescue => e
+            puts "Failed to download image: #{src} - #{e.message}"
+          end
+        else
+          # Handle internal image references within the EPUB
+          begin
+            # Extract the image filename from the URL
+            filename = File.basename(src)
+            image_path = File.join(@images_dir, filename)
+            # Find the image in the EPUB archive and extract it
+            # Normalize the path to handle both absolute and relative paths
+            normalized_path = src.start_with?('/') ? src[1..-1] : File.join(@parser.root_dir, src).gsub(/^\.\//, '')
+            entry = @parser.zip_file.find_entry(normalized_path)
+            if entry
+              # Extract the image from the EPUB to the local images directory
+              File.open(image_path, 'wb') do |f|
+                f.write(entry.get_input_stream.read)
+              end
+              # Update the image source to point to the local copy
+              img['src'] = "./images/#{filename}"
+            end
+          rescue => e
+            puts "Failed to extract internal image: #{src} - #{e.message}"
+          end
+        end
+      end
+      # Return the modified HTML
+      doc.to_html
+    end
+    def process_image_links(markdown_content, images_dir)
+      # Find all image links in markdown
+      markdown_content.gsub(/!\[([^\]]*)\]\(([^)]+)\)/) do |match|
+        alt_text = $1
+        url = $2
+        # Skip if it's already a local reference
+        next match if url.start_with?('.') || url.start_with?('/')
+        if url.start_with?('http://', 'https://')
+          # Download the image
+          begin
+            require 'open-uri'
+            uri = URI.parse(url)
+            filename = File.basename(uri.path)
+            filename = "image_#{Time.now.to_i}.jpg" if filename.empty? || filename == '/'
+            image_path = File.join(images_dir, filename)
+            # Download the image
+            open(url, 'rb') do |remote_file|
+              File.open(image_path, 'wb') do |local_file|
+                local_file.write(remote_file.read)
+              end
+            end
+            # Return local reference
+            "!#{alt_text.empty? ? '' : "[#{alt_text}]"}(./images/#{filename})"
+          rescue => e
+            puts "Failed to download image: #{url} - #{e.message}"
+            match
+          end
+        else
+          # Handle internal image references within the EPUB
+          # These are typically relative paths to images inside the EPUB archive
+          begin
+            # Extract the image filename from the URL
+            filename = File.basename(url)
+            image_path = File.join(images_dir, filename)
+            # Find the image in the EPUB archive and extract it
+            # Access the zip file from the parser instance
+            # Normalize the path to handle both absolute and relative paths
+            normalized_path = url.start_with?('/') ? url[1..-1] : File.join(@parser.root_dir, url).gsub(/^\.\//, '')
+            entry = @parser.zip_file.find_entry(normalized_path)
+            if entry
+              # Extract the image from the EPUB to the local images directory
+              File.open(image_path, 'wb') do |f|
+                f.write(entry.get_input_stream.read)
+              end
+              # Return local reference
+              "!#{alt_text.empty? ? '' : "[#{alt_text}]"}(./images/#{filename})"
+            else
+              match
+            end
+          rescue => e
+            puts "Failed to extract internal image: #{url} - #{e.message}"
+            match
+          end
+        end
+      end
+    end
+    # Sanitize filename by removing invalid characters
+    # @param filename [String] original filename
+    # @return [String] sanitized filename
+    def sanitize_filename(filename)
+      # Remove invalid characters for filenames
+      filename.gsub(/[<>:"\/\\|?*]/, '_').gsub(/\s+/, '_')
+    end
+  end
+end

data/lib/epub2md/parser.rb ADDED Viewed

@@ -0,0 +1,251 @@
+require 'zip'
+require 'nokogiri'
+require 'pathname'
+module Epub2md
+  # Parses EPUB files to extract metadata, structure, and content
+  class Parser
+    attr_reader :epub_path, :zip_file, :opf_path, :root_dir, :manifest, :spine, :metadata, :structure, :sections
+    # Initialize parser with EPUB file path
+    # @param epub_path [String] path to EPUB file
+    def initialize(epub_path)
+      @epub_path = Pathname.new(epub_path)
+      raise "EPUB file does not exist: #{epub_path}" unless @epub_path.exist?
+      @zip_file = Zip::File.open(@epub_path.to_s)
+      @manifest = {}
+      @spine = []
+      @metadata = {}
+      @structure = []
+      @sections = []
+    end
+    # Parse the EPUB file and populate internal data structures
+    # @return [Parser] self for chaining
+    def parse
+      @opf_path = get_opf_path
+      @root_dir = determine_root_dir(@opf_path)
+      opf_content = read_file(@opf_path)
+      opf_doc = Nokogiri::XML(opf_content)
+      parse_package(opf_doc)
+      # parse_structure is not a method, it refers to the instance variable @structure
+      # which is populated during the parsing process
+      parse_sections
+      self
+    end
+    private
+    # Get the path to the OPF (Open Packaging Format) file from container.xml
+    # @return [String] path to OPF file
+    def get_opf_path
+      container_content = read_file('META-INF/container.xml')
+      container_doc = Nokogiri::XML(container_content)
+      rootfile = container_doc.at_xpath('//xmlns:rootfile[@full-path]')
+      rootfile['full-path'] if rootfile
+    end
+    # Determine the root directory based on OPF path
+    # @param opf_path [String] path to OPF file
+    # @return [String] root directory path
+    def determine_root_dir(opf_path)
+      Pathname.new(opf_path).dirname.to_s
+    end
+    # Parse the package document (OPF file) to extract metadata, manifest, spine, and TOC
+    # @param doc [Nokogiri::XML::Document] parsed OPF XML document
+    def parse_package(doc)
+      # Parse metadata
+      metadata_node = doc.at_xpath('//opf:metadata', { 'opf' => 'http://www.idpf.org/2007/opf' })
+      parse_metadata(metadata_node) if metadata_node
+      # Parse manifest
+      manifest_nodes = doc.xpath('//opf:manifest/opf:item', { 'opf' => 'http://www.idpf.org/2007/opf' })
+      parse_manifest(manifest_nodes)
+      # Parse spine
+      spine_nodes = doc.xpath('//opf:spine/opf:itemref', { 'opf' => 'http://www.idpf.org/2007/opf' })
+      parse_spine(spine_nodes)
+      # Parse TOC (Table of Contents)
+      parse_toc(doc)
+    end
+    # Parse metadata from the OPF document
+    # @param node [Nokogiri::XML::Element] metadata node
+    def parse_metadata(node)
+      # Extract common metadata elements
+      @metadata[:title] = get_text_content(node, 'dc:title')
+      @metadata[:author] = get_text_content(node, 'dc:creator')
+      @metadata[:description] = get_text_content(node, 'dc:description')
+      @metadata[:language] = get_text_content(node, 'dc:language')
+      @metadata[:publisher] = get_text_content(node, 'dc:publisher')
+      @metadata[:rights] = get_text_content(node, 'dc:rights')
+    end
+    # Extract text content from a specific XPath within the metadata node
+    # @param node [Nokogiri::XML::Element] parent node
+    # @param xpath [String] xpath to the desired element
+    # @return [String, nil] text content or nil if not found
+    def get_text_content(node, xpath)
+      element = node.at_xpath(xpath, { 'dc' => 'http://purl.org/dc/elements/1.1/' })
+      element.text.strip if element
+    rescue
+      nil
+    end
+    # Parse manifest items from the OPF document
+    # @param nodes [Array<Nokogiri::XML::Element>] manifest item nodes
+    def parse_manifest(nodes)
+      nodes.each do |node|
+        id = node['id']
+        href = node['href']
+        media_type = node['media-type']
+        @manifest[id] = {
+          href: href,
+          media_type: media_type
+        }
+      end
+    end
+    # Parse spine items from the OPF document
+    # @param nodes [Array<Nokogiri::XML::Element>] spine itemref nodes
+    def parse_spine(nodes)
+      nodes.each_with_index do |node, index|
+        idref = node['idref']
+        linear = node['linear'] != 'no' # Default is linear unless explicitly marked as non-linear
+        @spine << {
+          idref: idref,
+          linear: linear,
+          position: index
+        }
+      end
+    end
+    # Parse the Table of Contents from either NCX or navigation document
+    # @param doc [Nokogiri::XML::Document] parsed OPF XML document
+    def parse_toc(doc)
+      # Look for NCX file in manifest
+      ncx_item = @manifest.find { |_, v| v[:media_type] == 'application/x-dtbncx+xml' }
+      if ncx_item
+        # Ensure we have a proper path without leading dot
+        ncx_href = ncx_item[1][:href]
+        ncx_path = ncx_href.start_with?('/') ? ncx_href[1..-1] : File.join(@root_dir, ncx_href).gsub(/^\.\//, '')
+        ncx_content = read_file(ncx_path)
+        ncx_doc = Nokogiri::XML(ncx_content)
+        parse_ncx_toc(ncx_doc)
+      else
+        # Fallback to EPUB 3 navigation document
+        nav_item = @manifest.find { |_, v| v[:media_type] == 'application/xhtml+xml' && v[:href].include?('nav') }
+        if nav_item
+          nav_href = nav_item[1][:href]
+          nav_path = nav_href.start_with?('/') ? nav_href[1..-1] : File.join(@root_dir, nav_href).gsub(/^\.\//, '')
+          nav_content = read_file(nav_path)
+          nav_doc = Nokogiri::XML(nav_content)
+          parse_nav_toc(nav_doc)
+        end
+      end
+    end
+    def parse_ncx_toc(doc)
+      nav_map = doc.at_xpath('//ncx:navMap', { 'ncx' => 'http://www.daisy.org/z3986/2005/ncx/' })
+      return unless nav_map
+      @structure = parse_nav_points(nav_map.xpath('.//ncx:navPoint', { 'ncx' => 'http://www.daisy.org/z3986/2005/ncx/' }))
+    end
+    def parse_nav_toc(doc)
+      # Parse navigation document for EPUB 3
+      nav_elements = doc.xpath('//nav[@epub:type="toc"]//ol/li', { 'epub' => 'http://www.idpf.org/2007/ops' })
+      @structure = parse_nav_list(nav_elements)
+    end
+    def parse_nav_points(nav_points)
+      nav_points.map do |point|
+        label = point.at_xpath('.//ncx:navLabel/ncx:text', { 'ncx' => 'http://www.daisy.org/z3986/2005/ncx/' })
+        content = point.at_xpath('.//ncx:content', { 'ncx' => 'http://www.daisy.org/z3986/2005/ncx/' })
+        nav_point = {
+          name: (label.text.strip if label),
+          path: (content['src'] if content),
+          play_order: point['playorder'],
+          children: []
+        }
+        child_points = point.xpath('./ncx:navPoint', { 'ncx' => 'http://www.daisy.org/z3986/2005/ncx/' })
+        nav_point[:children] = parse_nav_points(child_points) if child_points.any?
+        nav_point
+      end
+    end
+    def parse_nav_list(nav_elements)
+      nav_elements.map do |element|
+        link = element.at_css('a')
+        sub_list = element.at_css('ol')
+        nav_item = {
+          name: (link.text.strip if link),
+          path: (link['href'] if link),
+          children: []
+        }
+        if sub_list
+          sub_items = sub_list.xpath('./li')
+          nav_item[:children] = parse_nav_list(sub_items) if sub_items.any?
+        end
+        nav_item
+      end
+    end
+    # Parse sections (chapters) from the spine and manifest
+    # Adds section data to the @sections array
+    def parse_sections
+      @spine.each do |spine_item|
+        manifest_item = @manifest[spine_item[:idref]]
+        next unless manifest_item && manifest_item[:media_type] == 'application/xhtml+xml'
+        href = manifest_item[:href]
+        path = href.start_with?('/') ? href[1..-1] : File.join(@root_dir, href).gsub(/^\.\//, '')
+        html_content = read_file(path)
+        @sections << {
+          id: spine_item[:idref],
+          path: path,
+          html_content: html_content,
+          title: extract_title_from_html(html_content)
+        }
+      end
+    end
+    def extract_title_from_html(html_content)
+      doc = Nokogiri::HTML(html_content)
+      title_element = doc.at_css('title')
+      title_element.text.strip if title_element
+    rescue
+      nil
+    end
+    # Read a file from the EPUB archive
+    # @param path [String] path to file within EPUB
+    # @return [String] file content
+    def read_file(path)
+      # Normalize the path to handle both absolute and relative paths
+      normalized_path = path.start_with?('/') ? path[1..-1] : path
+      # Use find instead of glob for exact match
+      entry = @zip_file.find_entry(normalized_path)
+      raise "File not found in EPUB: #{path}" unless entry
+      entry.get_input_stream.read
+    end
+  end
+end

data/lib/epub2md/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module Epub2md
+  VERSION = '0.1.0'
+end

data/lib/epub2md.rb ADDED Viewed

@@ -0,0 +1,8 @@
+require_relative 'epub2md/version'
+require_relative 'epub2md/parser'
+require_relative 'epub2md/converter'
+module Epub2md
+  class Error < StandardError; end
+  # Your code goes here...
+end

metadata ADDED Viewed

@@ -0,0 +1,152 @@
+--- !ruby/object:Gem::Specification
+name: epub2md
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- lucas.qin
+bindir: exe
+cert_chain: []
+date: 1980-01-02 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rubyzip
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.3'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.3'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.13'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.13'
+- !ruby/object:Gem::Dependency
+  name: reverse_markdown
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.1'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.1'
+- !ruby/object:Gem::Dependency
+  name: thor
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.2'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.2'
+- !ruby/object:Gem::Dependency
+  name: httparty
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.21'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.21'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.12'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.12'
+- !ruby/object:Gem::Dependency
+  name: pry
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.14'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.14'
+description: Convert EPUB files to Markdown with support for images, chapters, and
+  formatting
+email:
+- qsc1956826@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- Gemfile
+- Gemfile.lock
+- README.md
+- Rakefile
+- bin/epub2md
+- epub2md.gemspec
+- lib/epub2md.rb
+- lib/epub2md/converter.rb
+- lib/epub2md/parser.rb
+- lib/epub2md/version.rb
+homepage: https://github.com/GengCen-Qin/epub2md
+licenses:
+- MIT
+metadata:
+  homepage_uri: https://github.com/GengCen-Qin/epub2md
+  source_code_uri: https://github.com/GengCen-Qin/epub2md
+  changelog_uri: https://github.com/GengCen-Qin/epub2md/blob/main/CHANGELOG.md
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: 2.6.0
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.6.9
+specification_version: 4
+summary: A Ruby gem to convert EPUB files to Markdown format
+test_files: []