RubyGems - repub - Versions diffs - 0.3.3 → 0.3.4 - Mend

repub 0.3.3 → 0.3.4

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (28)

data/History.txt +11 -0
data/README.rdoc +14 -8
data/TODO +0 -2
data/lib/repub.rb +1 -1
data/lib/repub/app.rb +3 -0
data/lib/repub/app/builder.rb +151 -154
data/lib/repub/app/fetcher.rb +10 -23
data/lib/repub/app/filter.rb +30 -0
data/lib/repub/app/options.rb +0 -6
data/lib/repub/app/parser.rb +63 -73
data/lib/repub/app/post_filters.rb +135 -0
data/lib/repub/app/pre_filters.rb +50 -0
data/lib/repub/app/profile.rb +1 -1
data/lib/repub/epub.rb +4 -3
data/lib/repub/epub/container_item.rb +49 -0
data/lib/repub/epub/{toc.rb → ncx.rb} +137 -139
data/lib/repub/epub/ocf.rb +62 -0
data/lib/repub/epub/opf.rb +136 -0
data/repub.gemspec +4 -4
data/test/epub/{test_toc.rb → test_ncx.rb} +14 -12
data/test/epub/test_ocf.rb +28 -0
data/test/epub/{test_content.rb → test_opf.rb} +25 -19
data/test/test_filter.rb +28 -0
data/test/test_parser.rb +3 -4
metadata +17 -11
data/lib/repub/epub/container.rb +0 -28
data/lib/repub/epub/content.rb +0 -178
data/test/epub/test_container.rb +0 -15

data/lib/repub/app/fetcher.rb CHANGED

@@ -4,7 +4,7 @@ require 'uri'
 require 'iconv'
 require 'rubygems'
-# Temporary disable warnings from chardet
+# Disable warnings from chardet
 old_verbose = $VERBOSE
 $VERBOSE = false
 require 'UniversalDetector'
@@ -17,7 +17,7 @@ module Repub
       class FetcherException < RuntimeError; end
       def fetch
-        Fetcher.new(options).fetch
+        FetcherSupport.new(options).fetch
       end
       AssetTypes = {
@@ -26,7 +26,7 @@ module Repub
         :images => %w[jpg jpeg png gif svg]
       }
-      class Fetcher
+      class FetcherSupport
         include Logger
         Downloaders = {
@@ -63,34 +63,21 @@ module Repub
               raise FetcherException, "Fetch failed."
             end
             unless cache.cached?
-              fix_filenames(cache)
-              fix_encoding(cache, @options[:encoding])
+              preprocess cache
             end
           end
         end
         private
-        def fix_filenames(cache)
-          # TODO: fix non-alphanum characters in doc filenames
-        end
-        def fix_encoding(cache, encoding = nil)
-          cache.assets[:documents].each do |doc|
-            unless encoding
-              log.info "Detecting encoding for #{doc}"
-              s = IO.read(doc)
-              raise FetcherException, "empty document" unless s
-              encoding = UniversalDetector.chardet(s)['encoding']
-            end
-            if encoding.downcase != 'utf-8'
-              log.info "Source encoding appears to be #{encoding}, converting to UTF-8"
-              s = Iconv.conv('utf-8', encoding, IO.read(doc))
-              File.open(doc, 'w') { |f| f.write(s) }
-            end
+        def preprocess(cache)
+          cache.assets[:documents].each do |file|
+            log.info "Preprocessing #{file}"
+            s = PreFilters.apply_filters(IO.read(file), @options)
+            File.open(file, 'w') { |f| f.write(s) }
           end
         end
         def which(cmd)
           if !RUBY_PLATFORM.match('mswin')
             cmd = `/usr/bin/which #{cmd}`.strip

data/lib/repub/app/filter.rb ADDED

@@ -0,0 +1,30 @@
+module Repub
+  class App
+    module Filter
+      def self.included(base)
+        (class << base; self; end).instance_eval do
+          define_method(:filter) do |name, &block|
+            @filters ||= []
+            @filters << {:name => name, :proc => Proc.new(&block) }
+          end
+          attr_reader :filters
+          attr_reader :options
+        end
+        base.extend(ClassMethods)
+        base.extend(Logger)
+      end
+      def options
+        self.class.options
+      end
+      module ClassMethods
+        def apply_filters(input, options = nil)
+          @options = options
+          @filters.inject(input) { |input, filter| filter[:proc].call(input) }
+        end
+      end
+    end
+  end
+end

data/lib/repub/app/options.rb CHANGED

@@ -17,7 +17,6 @@ module Repub
           :browser        => false,
           :css            => nil,
           :encoding       => nil,
-          :fixup          => true,
           :helper         => 'wget',
           :metadata       => {},
           :output_path    => Dir.getwd,
@@ -119,11 +118,6 @@ module Repub
             options[:metadata][name.to_sym] = value
           end
-          opts.on("-F", "--no-fixup",
-            "Do not attempt to make document meet XHTML 1.0 Strict.",
-            "Default is to try and fix things that are broken. "
-          ) { |value| options[:fixup] = false }
           opts.on("-e", "--encoding NAME", String,
             "Set source document encoding. Default is to autodetect."
           ) { |value| options[:encoding] = value }

data/lib/repub/app/parser.rb CHANGED

@@ -1,5 +1,6 @@
 require 'rubygems'
 require 'nokogiri'
+require 'repub/epub'
 module Repub
   class App
@@ -11,7 +12,7 @@ module Repub
         Parser.new(options).parse(cache)
       end
-      # Default selectors
+      # Default selectors, some reasonable values
       #
       Selectors = {
         :title        => '//h1',
@@ -26,37 +27,36 @@ module Repub
         attr_reader :cache
         attr_reader :uid
         attr_reader :title
-        attr_reader :title_html
         attr_reader :toc
         def initialize(options)
           @selectors = options[:selectors] || Selectors
-          @fixup = options[:fixup]
         end
+        # Parse downloaded asset cache
+        #
         def parse(cache)
           raise ParserException, "No HTML document found" if
             cache.assets[:documents].empty?
+          # TODO: limited to a single document only
           raise ParserException, "More than one HTML document found, this is not supported (yet)" if
             cache.assets[:documents].size > 1
           @cache = cache
-          @asset = @cache.assets[:documents][0]
-          log.debug "-- Parsing #{@asset}"
-          @doc = Nokogiri::HTML.parse(IO.read(File.join(@cache.path, @asset)), nil, 'UTF-8')
+          @document = @cache.assets[:documents][0]
+          log.debug "-- Parsing #{@document}"
+          @doc = Nokogiri::HTML.parse(IO.read(File.join(@cache.path, @document)), nil, 'UTF-8')
           @uid = @cache.name
           parse_title
-          parse_title_html
           parse_toc
           self
         end
         private
-        UNTITLED = 'Untitled'
+        # Parse document title
+        #
         def parse_title
           log.debug "-- Looking for title with #{@selectors[:title]}"
           el = @doc.at(@selectors[:title])
@@ -69,82 +69,72 @@ module Repub
             @title = title_text.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
             log.info "Found title \"#{@title}\""
           else
-            @title = UNTITLED
+            @title = 'Untitled'
             log.warn "** Could not find document title, using '#{@title}'"
           end
         end
-        def parse_title_html
-          log.debug "-- Looking for html title with #{@selectors[:title]}"
-          el = @doc.at(@selectors[:title])
-          @title_html = el ? el.inner_html.gsub(/[\r\n]/, '') : UNTITLED
-        end
-        # Helper container for TOC items
+        # Parsed TOC item container
+        # Inherit from NavPoint to avoid conversions later in Builder
         #
-        class TocItem < Struct.new(
-            :title,
-            :uri,
-            :fragment_id
-          )
+        class TocItem < Repub::Epub::NCX::NavPoint
-          def initialize(title, uri_with_fragment_id, subitems, asset)
-            self.title = title
-            self.uri, self.fragment_id = uri_with_fragment_id.split(/#/)
-            self.uri = asset if self.uri.empty?
-            @subitems = subitems || []
+          def initialize(title, uri_with_fragment_id, subitems, document)
+            uri, fragment_id = uri_with_fragment_id.split(/#/)
+            uri = document if uri.empty?
+            super(title, "#{uri}##{fragment_id}", subitems)
           end
-          attr_reader :subitems
-          def src
-            "#{uri}##{fragment_id}"
-          end
-        end
+        end
+        # Look for TOC and recursively parse it
+        #
         def parse_toc
+          @toc = []
+          depth = 0
+          l = lambda do |section|
+            toc_items = []
+            depth += 1
+            section.xpath(@selectors[:toc_item]).each do |item|
+              # Get item's anchor and href
+              a = item.name == 'a' ? item : item.at('a')
+              next if !a
+              href = a['href']
+              next if !href
+              # Is this a leaf item or node? Title parsing depends on that.
+              subsection = item.xpath(@selectors[:toc_section]).first
+              if subsection
+                # Item has subsection, use anchor text for title
+                title = a.inner_text
+              else
+                # Leaf item, it is safe to glue inner_text from all children
+                title = item.children.map{|c| c.inner_text }.join(' ')
+              end
+              title = title.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
+              log.debug "-- #{"  " * depth}#{title}"
+              # Parse subsection
+              subitems = l.call(subsection) if subsection
+              toc_items << TocItem.new(title, href, subitems, @document)
+            end
+            depth -= 1
+            toc_items
+          end
           log.debug "-- Looking for TOC with #{@selectors[:toc]}"
-          el = @doc.xpath(@selectors[:toc]).first
-          if el
-            @toc = parse_toc_section(el)
+          toc_element = @doc.xpath(@selectors[:toc]).first
+          if toc_element
+            log.debug "-- Found TOC, parsing items with #{@selectors[:toc_item]} and sections with #{@selectors[:toc_section]}"
+            @toc = l.call(toc_element)
             log.info "Found TOC with #{@toc.size} top-level items"
           else
-            @toc = []
             log.warn "** Could not find document table of contents"
           end
         end
-        def parse_toc_section(section)
-          toc = []
-          log.debug "-- Looking for TOC items with #{@selectors[:toc_item]}"
-          section.xpath(@selectors[:toc_item]).each do |item|
-            # Get item's anchor and href
-            a = item.name == 'a' ? item : item.at('a')
-            next if !a
-            href = a['href']
-            next if !href
-            # Is this a leaf item or node ?
-            subsection = item.xpath(@selectors[:toc_section]).first
-            if subsection
-              # Item has subsection, use anchor text for title
-              title = a.inner_text
-            else
-              # Leaf item, glue inner_text from all children
-              title = item.children.map{|c| c.inner_text }.join(' ')
-            end
-            title = title.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
-            log.debug "-- Found item: #{title}"
-            # Parse sub-section
-            if subsection
-              log.debug "-- Found section with #{@selectors[:toc_section]}"
-              log.debug "-- >"
-              subitems = parse_toc_section(subsection)
-              log.debug '-- .'
-            end
-            toc << TocItem.new(title, href, subitems, @asset)
-          end
-          toc
-        end
       end
     end

data/lib/repub/app/post_filters.rb ADDED

@@ -0,0 +1,135 @@
+require 'repub/app/filter'
+module Repub
+  class App
+    class PostFilters
+      class FileFilters
+        include Filter
+        # Do rx substitutions
+        #
+        filter :do_rxes do |s|
+          options[:rx].each do |rx|
+            rx.strip!
+            delimiter = rx[0, 1]
+            rx = rx.gsub(/\\#{delimiter}/, "\n")
+            ra = rx.split(/#{delimiter}/).reject {|e| e.empty? }.each {|e| e.gsub!(/\n/, "#{delimiter}")}
+            raise ParserException, "Invalid regular expression" if ra.empty? || ra[0].nil? || ra.size > 2
+            pattern = ra[0]
+            replacement = ra[1] || ''
+            log.info "Replacing pattern /#{pattern.gsub(/#{delimiter}/, "\\#{delimiter}")}/ with \"#{replacement}\""
+            s.gsub!(Regexp.new(pattern), replacement)
+          end if options[:rx]
+          s
+        end
+        # Remove xml preamble if any
+        #
+        filter :fix_xml_preamble do |s|
+          preamble_rx = /^\s*<\?xml\s+[^>]+>\s*/mi
+          if s =~ preamble_rx
+            log.debug "-- Removing xml preamble"
+            s.sub!(preamble_rx, '')
+          end
+          s
+        end
+        # Replace doctype
+        #
+        filter :fix_doctype do |s|
+          doctype_rx = /^\s*<!DOCTYPE\s+[^>]+>\s*/mi
+          if s =~ doctype_rx
+            s.sub!(doctype_rx, '')
+          end
+          log.debug "-- Replacing doctype"
+          s = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + s
+          s
+        end
+      end
+      class DocumentFilters
+        include Filter
+        # Set Content-Type charset to UTF-8
+        #
+        filter :fix_content_type do |doc|
+          doc.xpath('//head/meta[@http-equiv="Content-Type"]').each do |el|
+            el['content'] = 'text/html; charset=utf-8'
+          end
+          doc
+        end
+        # Process styles
+        #
+        filter :fix_styles do |doc|
+          if options[:css] && !options[:css].empty?
+            # Remove all stylesheet links
+            doc.xpath('//head/link[@rel="stylesheet"]').remove
+            if options[:css] == '-'
+              # Also remove all inline styles
+              doc.xpath('//head/style').remove
+              log.info "Removing all stylesheet links and style elements"
+            else
+              # Add custom stylesheet link
+              link = Nokogiri::XML::Node.new('link', doc)
+              link['rel'] = 'stylesheet'
+              link['type'] = 'text/css'
+              link['href'] = File.basename(@options[:css])
+              # Add as the last child so it takes precedence over any inline styles that precede it
+              doc.at('//head').add_child(link)
+              log.info "Replacing CSS refs with \"#{link['href']}\""
+            end
+          end
+          doc
+        end
+        # Insert elements after/before selector
+        #
+        filter :do_inserts do |doc|
+          options[:after].each do |e|
+            selector = e.keys.first
+            fragment = e[selector]
+            element = doc.xpath(selector).first
+            if element
+              log.info "Inserting fragment \"#{fragment.to_html}\" after \"#{selector}\""
+              fragment.children.to_a.reverse.each {|node| element.add_next_sibling(node) }
+            end
+          end if options[:after]
+          options[:before].each do |e|
+            selector = e.keys.first
+            fragment = e[selector]
+            element = doc.xpath(selector).first
+            if element
+              log.info "Inserting fragment \"#{fragment}\" before \"#{selector}\""
+              fragment.children.to_a.each {|node| element.add_previous_sibling(node) }
+            end
+          end if options[:before]
+          doc
+        end
+        # Remove elements
+        #
+        filter :do_removes do |doc|
+          options[:remove].each do |selector|
+            log.info "Removing elements \"#{selector}\""
+            doc.search(selector).remove
+          end if options[:remove]
+          doc
+        end
+        # TODO: XHTML requires <a> elements directly under <body> to be wrapped in an enclosing element (e.g. <p>)
+        # filter :wrap_anchors do |doc|
+        #   log.info "Wrapping anchors"
+        #   doc.xpath('//body/a').each do |a|
+        #     wrapper = Nokogiri::XML::Node.new('p', doc)
+        #     a.add_next_sibling(wrapper)
+        #     wrapper << a
+        #   end
+        #   doc
+        # end
+      end
+    end
+  end
+end