RubyGems - epub_tools - Versions diffs - 0.4.1 → 0.6.0 - Mend

epub_tools 0.4.1 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56)

checksums.yaml +4 -4
data/.github/workflows/ci.yml +3 -0
data/.rubocop.yml +10 -17
data/CLAUDE.md +128 -0
data/Gemfile +4 -4
data/Gemfile.lock +39 -34
data/README.md +37 -24
data/Rakefile +2 -0
data/bin/epub-tools +2 -0
data/epub_tools.gemspec +3 -1
data/lib/epub_tools/add_chapters.rb +64 -33
data/lib/epub_tools/append_book.rb +81 -0
data/lib/epub_tools/book_builder.rb +108 -0
data/lib/epub_tools/chapter_marker_detector.rb +46 -0
data/lib/epub_tools/chapter_validator.rb +50 -0
data/lib/epub_tools/cli/command_options_configurator.rb +128 -0
data/lib/epub_tools/cli/command_registry.rb +2 -0
data/lib/epub_tools/cli/option_builder.rb +5 -3
data/lib/epub_tools/cli/runner.rb +60 -110
data/lib/epub_tools/cli.rb +17 -29
data/lib/epub_tools/compile_book.rb +15 -146
data/lib/epub_tools/compile_workspace.rb +40 -0
data/lib/epub_tools/epub_configuration.rb +33 -0
data/lib/epub_tools/epub_file_writer.rb +57 -0
data/lib/epub_tools/epub_initializer.rb +83 -162
data/lib/epub_tools/epub_metadata_builder.rb +92 -0
data/lib/epub_tools/loggable.rb +2 -0
data/lib/epub_tools/pack_ebook.rb +28 -14
data/lib/epub_tools/split_chapters.rb +44 -56
data/lib/epub_tools/style_finder.rb +17 -6
data/lib/epub_tools/unpack_ebook.rb +20 -10
data/lib/epub_tools/version.rb +3 -1
data/lib/epub_tools/xhtml_cleaner.rb +1 -0
data/lib/epub_tools/xhtml_extractor.rb +20 -10
data/lib/epub_tools/xhtml_generator.rb +71 -0
data/lib/epub_tools.rb +5 -0
data/test/add_chapters_test.rb +119 -25
data/test/append_book_test.rb +127 -0
data/test/chapter_validator_test.rb +74 -0
data/test/cli/command_registry_test.rb +2 -0
data/test/cli/option_builder_test.rb +24 -14
data/test/cli/runner_test.rb +15 -15
data/test/cli_commands_test.rb +11 -0
data/test/cli_test.rb +2 -0
data/test/cli_version_test.rb +2 -0
data/test/compile_book_test.rb +16 -102
data/test/compile_workspace_test.rb +55 -0
data/test/epub_initializer_test.rb +55 -27
data/test/pack_ebook_test.rb +33 -9
data/test/split_chapters_test.rb +96 -7
data/test/style_finder_test.rb +2 -0
data/test/test_helper.rb +2 -0
data/test/unpack_ebook_test.rb +45 -20
data/test/xhtml_cleaner_test.rb +2 -0
data/test/xhtml_extractor_test.rb +3 -1
metadata +17 -3

data/lib/epub_tools/epub_metadata_builder.rb ADDED

@@ -0,0 +1,92 @@
+# frozen_string_literal: true
+module EpubTools
+  # Builds metadata content for EPUB package.opf files
+  class EpubMetadataBuilder
+    def initialize(config)
+      @config = config
+    end
+    # Builds complete metadata array
+    def build_metadata
+      metadata = []
+      add_dublin_core_metadata(metadata)
+      add_schema_metadata(metadata)
+      add_cover_metadata(metadata) if @config.cover_image_fname
+      metadata
+    end
+    # Builds manifest and spine items
+    def build_manifest_and_spine
+      manifest_items = []
+      spine_items = []
+      add_base_manifest_items(manifest_items)
+      add_cover_items(manifest_items, spine_items) if @config.cover_image_fname
+      add_title_items(manifest_items, spine_items)
+      [manifest_items, spine_items]
+    end
+    # Builds complete OPF XML content
+    def build_opf_xml(metadata, manifest_items, spine_items)
+      <<~XML
+        <?xml version="1.0" encoding="utf-8"?>
+        <package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="pub-id" xml:lang="en">
+          <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
+        #{metadata.map { |line| "    #{line}" }.join("\n")}
+          </metadata>
+          <manifest>
+        #{manifest_items.map { |line| "    #{line}" }.join("\n")}
+          </manifest>
+          <spine>
+        #{spine_items.map { |line| "    #{line}" }.join("\n")}
+          </spine>
+        </package>
+      XML
+    end
+    private
+    def add_dublin_core_metadata(metadata)
+      metadata << %(<dc:identifier id="pub-id">#{@config.uuid}</dc:identifier>)
+      metadata << %(<dc:title>#{@config.title}</dc:title>)
+      metadata << %(<dc:creator>#{@config.author}</dc:creator>)
+      metadata << '<dc:language>en</dc:language>'
+      metadata << %(<meta property="dcterms:modified">#{@config.modified}</meta>)
+    end
+    def add_schema_metadata(metadata)
+      metadata << %(<meta property="schema:accessMode">textual</meta>)
+      metadata << %(<meta property="schema:accessibilityFeature">unknown</meta>)
+      metadata << %(<meta property="schema:accessibilityHazard">none</meta>)
+      metadata << %(<meta property="schema:accessModeSufficient">textual</meta>)
+    end
+    def add_cover_metadata(metadata)
+      metadata << %(<meta name="cover" content="cover-image"/>)
+    end
+    def add_base_manifest_items(manifest_items)
+      manifest_items << mitem('style', 'style.css', 'text/css')
+      manifest_items << mitem('nav', 'nav.xhtml', 'application/xhtml+xml', 'nav')
+    end
+    def add_cover_items(manifest_items, spine_items)
+      manifest_items << mitem('cover-image', @config.cover_image_fname, @config.cover_image_media_type, 'cover-image')
+      manifest_items << mitem('cover-page', 'cover.xhtml', 'application/xhtml+xml')
+      spine_items << '<itemref idref="cover-page"/>'
+    end
+    def add_title_items(manifest_items, spine_items)
+      manifest_items << mitem('title', 'title.xhtml', 'application/xhtml+xml')
+      spine_items << '<itemref idref="title"/>'
+    end
+    def mitem(id, href, type, properties = nil)
+      xml = "<item id=\"#{id}\" href=\"#{href}\" media-type=\"#{type}\""
+      xml += " properties=\"#{properties}\"" if properties
+      "#{xml}/>"
+    end
+  end
+end

data/lib/epub_tools/loggable.rb CHANGED

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 module EpubTools
   # Provides logging capability to classes that include it
   module Loggable

data/lib/epub_tools/pack_ebook.rb CHANGED

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 require 'zip'
 require 'fileutils'
 require 'pathname'
@@ -7,6 +9,7 @@ module EpubTools
   # Packages an EPUB directory into a .epub file
   class PackEbook
     include Loggable
     # Initializes the class
     # @param options [Hash] Configuration options
     # @option options [String] :input_dir Path to the EPUB directory (containing mimetype, META-INF, OEBPS) (required)
@@ -28,21 +31,9 @@ module EpubTools
     def run
       validate_input!
       Dir.chdir(@input_dir) do
-        # determine the output path: absolute stays as-is, otherwise sibling to input_dir
-        target = Pathname.new(@output_file).absolute? ? @output_file : File.join('..', @output_file)
+        target = determine_output_path
         FileUtils.rm_f(target)
-        Zip::File.open(target, Zip::File::CREATE) do |zip|
-          # Add mimetype first and uncompressed
-          add_mimetype(zip)
-          # Add all other files with compression, preserving paths
-          Dir.glob('**/*', File::FNM_DOTMATCH).sort.each do |entry|
-            next if ['.', '..', 'mimetype'].include?(entry)
-            next if File.directory?(entry)
-            zip.add(entry, entry)
-          end
-        end
+        create_zip_file(target)
       end
       log "EPUB created: #{@output_file}"
       @output_file
@@ -50,6 +41,29 @@ module EpubTools
     private
+    def determine_output_path
+      # determine the output path: absolute stays as-is, otherwise sibling to input_dir
+      Pathname.new(@output_file).absolute? ? @output_file : File.join('..', @output_file)
+    end
+    def create_zip_file(target)
+      Zip::File.open(target, create: true) do |zip|
+        # Add mimetype first and uncompressed
+        add_mimetype(zip)
+        add_content_files(zip)
+      end
+    end
+    def add_content_files(zip)
+      # Add all other files with compression, preserving paths
+      Dir.glob('**/*', File::FNM_DOTMATCH).sort.each do |entry|
+        next if ['.', '..', 'mimetype'].include?(entry)
+        next if File.directory?(entry)
+        zip.add(entry, entry)
+      end
+    end
     def validate_input!
       raise ArgumentError, "Directory '#{@input_dir}' does not exist." unless Dir.exist?(@input_dir)

data/lib/epub_tools/split_chapters.rb CHANGED

@@ -1,51 +1,35 @@
 #!/usr/bin/env ruby
+# frozen_string_literal: true
 require 'nokogiri'
 require 'yaml'
 require 'fileutils'
 require_relative 'loggable'
 require_relative 'style_finder'
 require_relative 'xhtml_cleaner'
+require_relative 'chapter_marker_detector'
 module EpubTools
-  # Takes a Google Docs generated, already extracted from their EPUB, XHTML files with multiple
-  # chapters and it:
-  # - Extracts classes using {StyleFinder}[rdoc-ref:EpubTools::StyleFinder]
-  # - Looks for tags that say something like Chapter XX or Prologue and splits the text there
-  # - Creates new chapter_XX.xhtml files that are cleaned using
-  #   {XHTMLCleaner}[rdoc-ref:EpubTools::XHTMLCleaner]
-  # - Saves those files to +output_dir+
+  # Splits a multi-chapter XHTML file into individual chapter files.
   class SplitChapters
     include Loggable
-    # Initializes the class
-    # @param options [Hash] Configuration options
-    # @option options [String] :input_file Path to the source XHTML (required)
-    # @option options [String] :book_title Title to use in HTML <title> tags (required)
-    # @option options [String] :output_dir Where to write chapter files (default: './chapters')
-    # @option options [String] :output_prefix Filename prefix for chapter files (default: 'chapter')
-    # @option options [Boolean] :verbose Whether to print progress to STDOUT (default: false)
     def initialize(options = {})
       @input_file    = options.fetch(:input_file)
       @book_title    = options.fetch(:book_title)
       @output_dir    = options[:output_dir] || './chapters'
       @output_prefix = options[:output_prefix] || 'chapter'
       @verbose       = options[:verbose] || false
+      @detector      = ChapterMarkerDetector.new
     end
     # Runs the splitter
     # @return [Array<String>] List of generated chapter file paths
     def run
-      # Prepare output dir
       FileUtils.mkdir_p(@output_dir)
-      # Read the doc
-      raw_content = read_and_strip_problematic_tags
-      doc = Nokogiri::HTML(raw_content)
-      # Find Style Classes
+      doc = Nokogiri::HTML(read_and_strip_problematic_tags)
       StyleFinder.new({ file_path: @input_file, verbose: @verbose }).run
-      chapters = extract_chapters(doc)
-      write_chapter_files(chapters)
+      extract_chapters(doc).map { |number, content| write_chapter_file(number, content) }
     end
     private
@@ -60,38 +44,47 @@ module EpubTools
       current_fragment = nil
       doc.at('body').children.each do |node|
-        if (m = node.text.match(/Chapter\s+(\d+)/i)) && %w[p span h2 h3 h4].include?(node.name)
-          # start a new chapter (skip the marker node so title isn't duplicated)
-          chapters[current_number] = current_fragment.to_html if current_number
-          current_number = m[1].to_i
-          current_fragment = Nokogiri::HTML::DocumentFragment.parse('')
-        elsif prologue_marker?(node)
-          # start the prologue (skip the marker node)
-          chapters[current_number] = current_fragment.to_html if current_number
-          current_number = 0
-          current_fragment = Nokogiri::HTML::DocumentFragment.parse('')
-        else
-          current_fragment&.add_child(node.dup)
-        end
+        current_number, current_fragment = process_node(node, chapters, current_number, current_fragment)
       end
       chapters[current_number] = current_fragment.to_html if current_number
       chapters
     end
-    def write_chapter_files(chapters)
-      chapter_files = []
-      chapters.each do |number, content|
-        filename = write_chapter_file(number, content)
-        chapter_files << filename
+    def process_node(node, chapters, current_number, current_fragment)
+      marker = @detector.detect(node)
+      if marker
+        start_chapter(chapters, marker_number(marker, node), current_number, current_fragment)
+      else
+        current_fragment&.add_child(node.dup)
+        [current_number, current_fragment]
       end
-      chapter_files
+    end
+    def marker_number(marker, node)
+      case marker
+      when :continued then @detector.extract_chapter_number(node) + 0.5
+      when :chapter then @detector.extract_chapter_number(node)
+      when :prologue then 0
+      end
+    end
+    def start_chapter(chapters, number, current_number, current_fragment)
+      chapters[current_number] = current_fragment.to_html if current_number
+      [number, Nokogiri::HTML::DocumentFragment.parse('')]
     end
     def write_chapter_file(label, content)
-      display_label = display_label(label)
-      filename = File.join(@output_dir, "#{@output_prefix}_#{label}.xhtml")
-      File.write(filename, <<~HTML)
+      display = display_label(label)
+      filename = File.join(@output_dir, "#{@output_prefix}_#{file_label(label)}.xhtml")
+      File.write(filename, build_xhtml_template(display, content))
+      XHTMLCleaner.new({ filename: filename }).run
+      log("Extracted: #{filename}")
+      filename
+    end
+    def build_xhtml_template(display_label, content)
+      <<~HTML
         <?xml version="1.0" encoding="UTF-8"?>
         <html xmlns="http://www.w3.org/1999/xhtml" lang="en">
           <head>
@@ -104,21 +97,16 @@ module EpubTools
           </body>
         </html>
       HTML
-      XHTMLCleaner.new({ filename: filename }).run
-      log("Extracted: #{filename}")
-      filename
     end
-    def display_label(label)
-      label.positive? ? "Chapter #{label}" : 'Prologue'
+    def file_label(label)
+      label.is_a?(Float) ? label.to_s.gsub('.', '_') : label.to_s
     end
-    # Detect a bolded Prologue marker
-    def prologue_marker?(node)
-      return false unless %w[h3 h4].include?(node.name)
-      return false unless node.text.strip =~ /\APrologue\z/i
+    def display_label(label)
+      return 'Prologue' if label.zero?
-      true
+      "Chapter #{label}"
     end
   end
 end

data/lib/epub_tools/style_finder.rb CHANGED

@@ -1,4 +1,6 @@
 #!/usr/bin/env ruby
+# frozen_string_literal: true
 require 'nokogiri'
 require 'yaml'
 require_relative 'loggable'
@@ -9,6 +11,7 @@ module EpubTools
   # {SplitChapters}[rdoc-ref:EpubTools::SplitChapters].
   class StyleFinder
     include Loggable
     # Initializes the class
     # @param options [Hash] Configuration options
     # @option options [String] :file_path XHTML file to be analyzed (required)
@@ -24,18 +27,26 @@ module EpubTools
     # Runs the finder
     # @return [Hash] Data containing the extracted style classes (italics and bolds)
     def run
+      style_blocks = extract_style_blocks
+      italics, bolds = extract_style_classes(style_blocks)
+      generate_output(italics, bolds)
+    end
+    def extract_style_blocks
       doc = Nokogiri::HTML(File.read(@file_path))
-      style_blocks = doc.xpath('//style').map(&:text).join("\n")
+      doc.xpath('//style').map(&:text).join("\n")
+    end
+    def extract_style_classes(style_blocks)
       italics = extract_classes(style_blocks, /font-style\s*:\s*italic/)
-      bolds   = extract_classes(style_blocks, /font-weight\s*:\s*700/)
+      bolds = extract_classes(style_blocks, /font-weight\s*:\s*700/)
+      [italics, bolds]
+    end
+    def generate_output(italics, bolds)
       print_summary(italics, bolds) if @verbose
-      data = {
-        'italics' => italics,
-        'bolds' => bolds
-      }
+      data = { 'italics' => italics, 'bolds' => bolds }
       File.write(@output_path, data.to_yaml)
       data
     end

data/lib/epub_tools/unpack_ebook.rb CHANGED

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 require 'zip'
 require 'fileutils'
 require_relative 'loggable'
@@ -6,6 +8,7 @@ module EpubTools
   # Unpacks an EPUB (.epub file) into a directory
   class UnpackEbook
     include Loggable
     # Initializes the class
     # @param options [Hash] Configuration options
     # @option options [String] :epub_file Path to the .epub file to unpack (required)
@@ -23,22 +26,29 @@ module EpubTools
     def run
       validate!
       FileUtils.mkdir_p(@output_dir)
+      extract_entries
+      log "Unpacked #{File.basename(@epub_file)} to #{@output_dir}"
+      @output_dir
+    end
+    private
+    def extract_entries
       Zip::File.open(@epub_file) do |zip|
         zip.each do |entry|
-          dest_path = File.join(@output_dir, entry.name)
-          if entry.directory?
-            FileUtils.mkdir_p(dest_path)
-          else
-            FileUtils.mkdir_p(File.dirname(dest_path))
-            entry.extract(dest_path) { true }
-          end
+          extract_entry(entry)
         end
       end
-      log "Unpacked #{File.basename(@epub_file)} to #{@output_dir}"
-      @output_dir
     end
-    private
+    def extract_entry(entry)
+      if entry.directory?
+        FileUtils.mkdir_p(File.join(@output_dir, entry.name))
+      else
+        FileUtils.mkdir_p(File.join(@output_dir, File.dirname(entry.name)))
+        entry.extract(destination_directory: @output_dir) { true }
+      end
+    end
     def default_dir
       [File.dirname(@epub_file), File.basename(@epub_file, '.epub')].join('/')

data/lib/epub_tools/version.rb CHANGED

@@ -1,4 +1,6 @@
+# frozen_string_literal: true
 module EpubTools
   # Ruby Gem version number
-  VERSION = '0.4.1'.freeze
+  VERSION = '0.6.0'
 end

data/lib/epub_tools/xhtml_cleaner.rb CHANGED

@@ -1,4 +1,5 @@
 #!/usr/bin/env ruby
+# frozen_string_literal: true
 require 'nokogiri'
 require 'yaml'

data/lib/epub_tools/xhtml_extractor.rb CHANGED

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 require 'zip'
 require 'fileutils'
 require_relative 'loggable'
@@ -6,6 +8,7 @@ module EpubTools
   # Extracts text .xhtml files from EPUB archives, excluding nav.xhtml
   class XHTMLExtractor
     include Loggable
     # Initializes the class
     # @param options [Hash] Configuration options
     # @option options [String] :source_dir Directory containing source .epub files (required)
@@ -39,21 +42,28 @@ module EpubTools
       epub_name = File.basename(epub_path, '.epub')
       log "Extracting from #{epub_name}.epub"
       extracted_files = []
       Zip::File.open(epub_path) do |zip_file|
-        zip_file.each do |entry|
-          next unless entry.name.downcase.end_with?('.xhtml')
-          next if File.basename(entry.name).downcase == 'nav.xhtml'
-          output_path = File.join(@target_dir, "#{epub_name}_#{File.basename(entry.name)}")
-          FileUtils.mkdir_p(File.dirname(output_path))
-          entry.extract(output_path) { true }
-          log output_path
-          extracted_files << output_path
-        end
+        zip_file.each { |entry| extract_entry_if_xhtml(entry, epub_name, extracted_files) }
       end
       extracted_files
     rescue Zip::Error => e
       warn "⚠️ Failed to process #{epub_path}: #{e.message}"
     end
+    def extract_entry_if_xhtml(entry, epub_name, extracted_files)
+      return unless xhtml_entry?(entry)
+      renamed = "#{epub_name}_#{File.basename(entry.name)}"
+      output_path = File.join(@target_dir, renamed)
+      FileUtils.mkdir_p(File.dirname(output_path))
+      entry.extract(renamed, destination_directory: @target_dir) { true }
+      log output_path
+      extracted_files << output_path
+    end
+    def xhtml_entry?(entry)
+      entry.name.downcase.end_with?('.xhtml') && File.basename(entry.name).downcase != 'nav.xhtml'
+    end
   end
 end

data/lib/epub_tools/xhtml_generator.rb ADDED

@@ -0,0 +1,71 @@
+# frozen_string_literal: true
+module EpubTools
+  # Generates XHTML content for EPUB files
+  class XhtmlGenerator
+    attr_accessor :cover_image_fname
+    def initialize(title:, author:)
+      @title = title
+      @author = author
+      @cover_image_fname = nil
+    end
+    # Generates title page XHTML content
+    def build_title_page
+      <<~XHTML
+        <?xml version="1.0" encoding="UTF-8"?>
+        <html xmlns="http://www.w3.org/1999/xhtml" lang="en">
+          <head>
+            <meta charset="UTF-8" />
+            <title>#{@title}</title>
+            <link rel="stylesheet" type="text/css" href="style.css"/>
+          </head>
+          <body>
+            <h1 class="title">#{@title}</h1>
+            <p class="author">by #{@author}</p>
+          </body>
+        </html>
+      XHTML
+    end
+    # Generates cover page XHTML content
+    def build_cover_page
+      <<~XHTML
+        <?xml version="1.0" encoding="UTF-8"?>
+        <html xmlns="http://www.w3.org/1999/xhtml" lang="en">
+          <head>
+            <meta charset="UTF-8" />
+            <title>Cover</title>
+            <link rel="stylesheet" type="text/css" href="style.css"/>
+          </head>
+          <body>
+            <div class="cover-image">
+              <img src="#{@cover_image_fname}" alt="Cover"/>
+            </div>
+          </body>
+        </html>
+      XHTML
+    end
+    # Generates navigation XHTML content
+    def build_nav_page
+      <<~XHTML
+        <?xml version="1.0" encoding="utf-8"?>
+        <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="en">
+          <head>
+            <title>Table of Contents</title>
+          </head>
+          <body>
+            <nav epub:type="toc" id="toc">
+              <h1>Table of Contents</h1>
+              <ol>
+                <li><a href="title.xhtml">Title Page</a></li>
+              </ol>
+            </nav>
+          </body>
+        </html>
+      XHTML
+    end
+  end
+end

data/lib/epub_tools.rb CHANGED

@@ -1,13 +1,18 @@
+# frozen_string_literal: true
 require_relative 'epub_tools/version'
 require_relative 'epub_tools/loggable'
 require_relative 'epub_tools/add_chapters'
 require_relative 'epub_tools/epub_initializer'
 require_relative 'epub_tools/split_chapters'
+require_relative 'epub_tools/chapter_marker_detector'
 require_relative 'epub_tools/xhtml_cleaner'
 require_relative 'epub_tools/xhtml_extractor'
 require_relative 'epub_tools/pack_ebook'
 require_relative 'epub_tools/unpack_ebook'
+require_relative 'epub_tools/book_builder'
 require_relative 'epub_tools/compile_book'
+require_relative 'epub_tools/append_book'
 require_relative 'epub_tools/cli'
 # Wrapper for all the other classes