RubyGems - invisiblellama-repub - Versions diffs - 0.3.3 → 0.3.4 - Mend

invisiblellama-repub 0.3.3 → 0.3.4

Files changed (28) hide show

data/History.txt +11 -0
data/README.rdoc +14 -8
data/TODO +0 -2
data/lib/repub.rb +1 -1
data/lib/repub/app.rb +3 -0
data/lib/repub/app/builder.rb +151 -154
data/lib/repub/app/fetcher.rb +10 -23
data/lib/repub/app/filter.rb +30 -0
data/lib/repub/app/options.rb +0 -6
data/lib/repub/app/parser.rb +63 -73
data/lib/repub/app/post_filters.rb +135 -0
data/lib/repub/app/pre_filters.rb +50 -0
data/lib/repub/app/profile.rb +1 -1
data/lib/repub/epub.rb +4 -3
data/lib/repub/epub/container_item.rb +49 -0
data/lib/repub/epub/{toc.rb → ncx.rb} +137 -139
data/lib/repub/epub/ocf.rb +62 -0
data/lib/repub/epub/opf.rb +136 -0
data/repub.gemspec +4 -4
data/test/epub/{test_toc.rb → test_ncx.rb} +14 -12
data/test/epub/test_ocf.rb +28 -0
data/test/epub/{test_content.rb → test_opf.rb} +25 -19
data/test/test_filter.rb +28 -0
data/test/test_parser.rb +3 -4
metadata +17 -11
data/lib/repub/epub/container.rb +0 -28
data/lib/repub/epub/content.rb +0 -178
data/test/epub/test_container.rb +0 -15

data/lib/repub/app/fetcher.rb CHANGED Viewed

@@ -4,7 +4,7 @@ require 'uri'
 require 'iconv'
 require 'rubygems'
-# Temporary disable warnings from chardet
+# Disable warnings from chardet
 old_verbose = $VERBOSE
 $VERBOSE = false
 require 'UniversalDetector'
@@ -17,7 +17,7 @@ module Repub
       class FetcherException < RuntimeError; end
       def fetch
-        Fetcher.new(options).fetch
+        FetcherSupport.new(options).fetch
       end
       AssetTypes = {
@@ -26,7 +26,7 @@ module Repub
         :images => %w[jpg jpeg png gif svg]
       }
-      class Fetcher
+      class FetcherSupport
         include Logger
         Downloaders = {
@@ -63,34 +63,21 @@ module Repub
               raise FetcherException, "Fetch failed."
             end
             unless cache.cached?
-              fix_filenames(cache)
-              fix_encoding(cache, @options[:encoding])
+              preprocess cache
             end
           end
         end
         private
-        def fix_filenames(cache)
-          # TODO: fix non-alphanum characters in doc filenames
-        end
-        def fix_encoding(cache, encoding = nil)
-          cache.assets[:documents].each do |doc|
-            unless encoding
-              log.info "Detecting encoding for #{doc}"
-              s = IO.read(doc)
-              raise FetcherException, "empty document" unless s
-              encoding = UniversalDetector.chardet(s)['encoding']
-            end
-            if encoding.downcase != 'utf-8'
-              log.info "Source encoding appears to be #{encoding}, converting to UTF-8"
-              s = Iconv.conv('utf-8', encoding, IO.read(doc))
-              File.open(doc, 'w') { |f| f.write(s) }
-            end
+        def preprocess(cache)
+          cache.assets[:documents].each do |file|
+            log.info "Preprocessing #{file}"
+            s = PreFilters.apply_filters(IO.read(file), @options)
+            File.open(file, 'w') { |f| f.write(s) }
           end
         end
         def which(cmd)
           if !RUBY_PLATFORM.match('mswin')
             cmd = `/usr/bin/which #{cmd}`.strip

data/lib/repub/app/filter.rb ADDED Viewed

@@ -0,0 +1,30 @@
+module Repub
+  class App
+    module Filter
+      def self.included(base)
+        (class << base; self; end).instance_eval do
+          define_method(:filter) do |name, &block|
+            @filters ||= []
+            @filters << {:name => name, :proc => Proc.new(&block) }
+          end
+          attr_reader :filters
+          attr_reader :options
+        end
+        base.extend(ClassMethods)
+        base.extend(Logger)
+      end
+      def options
+        self.class.options
+      end
+      module ClassMethods
+        def apply_filters(input, options = nil)
+          @options = options
+          @filters.inject(input) { |input, filter| filter[:proc].call(input) }
+        end
+      end
+    end
+  end
+end

data/lib/repub/app/options.rb CHANGED Viewed

@@ -17,7 +17,6 @@ module Repub
           :browser        => false,
           :css            => nil,
           :encoding       => nil,
-          :fixup          => true,
           :helper         => 'wget',
           :metadata       => {},
           :output_path    => Dir.getwd,
@@ -119,11 +118,6 @@ module Repub
             options[:metadata][name.to_sym] = value
           end
-          opts.on("-F", "--no-fixup",
-            "Do not attempt to make document meet XHTML 1.0 Strict.",
-            "Default is to try and fix things that are broken. "
-          ) { |value| options[:fixup] = false }
           opts.on("-e", "--encoding NAME", String,
             "Set source document encoding. Default is to autodetect."
           ) { |value| options[:encoding] = value }

data/lib/repub/app/parser.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 require 'rubygems'
 require 'nokogiri'
+require 'repub/epub'
 module Repub
   class App
@@ -11,7 +12,7 @@ module Repub
         Parser.new(options).parse(cache)
       end
-      # Default selectors
+      # Default selectors, some reasonable values
       #
       Selectors = {
         :title        => '//h1',
@@ -26,37 +27,36 @@ module Repub
         attr_reader :cache
         attr_reader :uid
         attr_reader :title
-        attr_reader :title_html
         attr_reader :toc
         def initialize(options)
           @selectors = options[:selectors] || Selectors
-          @fixup = options[:fixup]
         end
+        # Parse downloaded asset cache
+        #
         def parse(cache)
           raise ParserException, "No HTML document found" if
             cache.assets[:documents].empty?
+          # TODO: limited to a single document only
           raise ParserException, "More than one HTML document found, this is not supported (yet)" if
             cache.assets[:documents].size > 1
           @cache = cache
-          @asset = @cache.assets[:documents][0]
-          log.debug "-- Parsing #{@asset}"
-          @doc = Nokogiri::HTML.parse(IO.read(File.join(@cache.path, @asset)), nil, 'UTF-8')
+          @document = @cache.assets[:documents][0]
+          log.debug "-- Parsing #{@document}"
+          @doc = Nokogiri::HTML.parse(IO.read(File.join(@cache.path, @document)), nil, 'UTF-8')
           @uid = @cache.name
           parse_title
-          parse_title_html
           parse_toc
           self
         end
         private
-        UNTITLED = 'Untitled'
+        # Parse document title
+        #
         def parse_title
           log.debug "-- Looking for title with #{@selectors[:title]}"
           el = @doc.at(@selectors[:title])
@@ -69,82 +69,72 @@ module Repub
             @title = title_text.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
             log.info "Found title \"#{@title}\""
           else
-            @title = UNTITLED
+            @title = 'Untitled'
             log.warn "** Could not find document title, using '#{@title}'"
           end
         end
-        def parse_title_html
-          log.debug "-- Looking for html title with #{@selectors[:title]}"
-          el = @doc.at(@selectors[:title])
-          @title_html = el ? el.inner_html.gsub(/[\r\n]/, '') : UNTITLED
-        end
-        # Helper container for TOC items
+        # Parsed TOC item container
+        # Inherit from NavPoint to avoid conversions later in Builder
         #
-        class TocItem < Struct.new(
-            :title,
-            :uri,
-            :fragment_id
-          )
+        class TocItem < Repub::Epub::NCX::NavPoint
-          def initialize(title, uri_with_fragment_id, subitems, asset)
-            self.title = title
-            self.uri, self.fragment_id = uri_with_fragment_id.split(/#/)
-            self.uri = asset if self.uri.empty?
-            @subitems = subitems || []
+          def initialize(title, uri_with_fragment_id, subitems, document)
+            uri, fragment_id = uri_with_fragment_id.split(/#/)
+            uri = document if uri.empty?
+            super(title, "#{uri}##{fragment_id}", subitems)
           end
-          attr_reader :subitems
-          def src
-            "#{uri}##{fragment_id}"
-          end
-        end
+        end
+        # Look for TOC and recursively parse it
+        #
         def parse_toc
+          @toc = []
+          depth = 0
+          l = lambda do |section|
+            toc_items = []
+            depth += 1
+            section.xpath(@selectors[:toc_item]).each do |item|
+              # Get item's anchor and href
+              a = item.name == 'a' ? item : item.at('a')
+              next if !a
+              href = a['href']
+              next if !href
+              # Is this a leaf item or node? Title parsing depends on that.
+              subsection = item.xpath(@selectors[:toc_section]).first
+              if subsection
+                # Item has subsection, use anchor text for title
+                title = a.inner_text
+              else
+                # Leaf item, it is safe to glue inner_text from all children
+                title = item.children.map{|c| c.inner_text }.join(' ')
+              end
+              title = title.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
+              log.debug "-- #{"  " * depth}#{title}"
+              # Parse subsection
+              subitems = l.call(subsection) if subsection
+              toc_items << TocItem.new(title, href, subitems, @document)
+            end
+            depth -= 1
+            toc_items
+          end
           log.debug "-- Looking for TOC with #{@selectors[:toc]}"
-          el = @doc.xpath(@selectors[:toc]).first
-          if el
-            @toc = parse_toc_section(el)
+          toc_element = @doc.xpath(@selectors[:toc]).first
+          if toc_element
+            log.debug "-- Found TOC, parsing items with #{@selectors[:toc_item]} and sections with #{@selectors[:toc_section]}"
+            @toc = l.call(toc_element)
             log.info "Found TOC with #{@toc.size} top-level items"
           else
-            @toc = []
             log.warn "** Could not find document table of contents"
           end
         end
-        def parse_toc_section(section)
-          toc = []
-          log.debug "-- Looking for TOC items with #{@selectors[:toc_item]}"
-          section.xpath(@selectors[:toc_item]).each do |item|
-            # Get item's anchor and href
-            a = item.name == 'a' ? item : item.at('a')
-            next if !a
-            href = a['href']
-            next if !href
-            # Is this a leaf item or node ?
-            subsection = item.xpath(@selectors[:toc_section]).first
-            if subsection
-              # Item has subsection, use anchor text for title
-              title = a.inner_text
-            else
-              # Leaf item, glue inner_text from all children
-              title = item.children.map{|c| c.inner_text }.join(' ')
-            end
-            title = title.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
-            log.debug "-- Found item: #{title}"
-            # Parse sub-section
-            if subsection
-              log.debug "-- Found section with #{@selectors[:toc_section]}"
-              log.debug "-- >"
-              subitems = parse_toc_section(subsection)
-              log.debug '-- .'
-            end
-            toc << TocItem.new(title, href, subitems, @asset)
-          end
-          toc
-        end
       end
     end

data/lib/repub/app/post_filters.rb ADDED Viewed

@@ -0,0 +1,135 @@
+require 'repub/app/filter'
+module Repub
+  class App
+    class PostFilters
+      class FileFilters
+        include Filter
+        # Do rx substitutions
+        #
+        filter :do_rxes do |s|
+          options[:rx].each do |rx|
+            rx.strip!
+            delimiter = rx[0, 1]
+            rx = rx.gsub(/\\#{delimiter}/, "\n")
+            ra = rx.split(/#{delimiter}/).reject {|e| e.empty? }.each {|e| e.gsub!(/\n/, "#{delimiter}")}
+            raise ParserException, "Invalid regular expression" if ra.empty? || ra[0].nil? || ra.size > 2
+            pattern = ra[0]
+            replacement = ra[1] || ''
+            log.info "Replacing pattern /#{pattern.gsub(/#{delimiter}/, "\\#{delimiter}")}/ with \"#{replacement}\""
+            s.gsub!(Regexp.new(pattern), replacement)
+          end if options[:rx]
+          s
+        end
+        # Remove xml preamble if any
+        #
+        filter :fix_xml_preamble do |s|
+          preamble_rx = /^\s*<\?xml\s+[^>]+>\s*/mi
+          if s =~ preamble_rx
+            log.debug "-- Removing xml preamble"
+            s.sub!(preamble_rx, '')
+          end
+          s
+        end
+        # Replace doctype
+        #
+        filter :fix_doctype do |s|
+          doctype_rx = /^\s*<!DOCTYPE\s+[^>]+>\s*/mi
+          if s =~ doctype_rx
+            s.sub!(doctype_rx, '')
+          end
+          log.debug "-- Replacing doctype"
+          s = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + s
+          s
+        end
+      end
+      class DocumentFilters
+        include Filter
+        # Set Content-Type charset to UTF-8
+        #
+        filter :fix_content_type do |doc|
+          doc.xpath('//head/meta[@http-equiv="Content-Type"]').each do |el|
+            el['content'] = 'text/html; charset=utf-8'
+          end
+          doc
+        end
+        # Process styles
+        #
+        filter :fix_styles do |doc|
+          if options[:css] && !options[:css].empty?
+            # Remove all stylesheet links
+            doc.xpath('//head/link[@rel="stylesheet"]').remove
+            if options[:css] == '-'
+              # Also remove all inline styles
+              doc.xpath('//head/style').remove
+              log.info "Removing all stylesheet links and style elements"
+            else
+              # Add custom stylesheet link
+              link = Nokogiri::XML::Node.new('link', doc)
+              link['rel'] = 'stylesheet'
+              link['type'] = 'text/css'
+              link['href'] = File.basename(@options[:css])
+              # Add as the last child so it has precedence over (possible) inline styles before
+              doc.at('//head').add_child(link)
+              log.info "Replacing CSS refs with \"#{link['href']}\""
+            end
+          end
+          doc
+        end
+        # Insert elements after/before selector
+        #
+        filter :do_inserts do |doc|
+          options[:after].each do |e|
+            selector = e.keys.first
+            fragment = e[selector]
+            element = doc.xpath(selector).first
+            if element
+              log.info "Inserting fragment \"#{fragment.to_html}\" after \"#{selector}\""
+              fragment.children.to_a.reverse.each {|node| element.add_next_sibling(node) }
+            end
+          end if options[:after]
+          options[:before].each do |e|
+            selector = e.keys.first
+            fragment = e[selector]
+            element = doc.xpath(selector).first
+            if element
+              log.info "Inserting fragment \"#{fragment}\" before \"#{selector}\""
+              fragment.children.to_a.each {|node| element.add_previous_sibling(node) }
+            end
+          end if options[:before]
+          doc
+        end
+        # Remove elements
+        #
+        filter :do_removes do |doc|
+          options[:remove].each do |selector|
+            log.info "Removing elements \"#{selector}\""
+            doc.search(selector).remove
+          end if options[:remove]
+          doc
+        end
+        # TODO: XHTML requires a to have embedding element
+        # filter :wrap_anchors do |doc|
+        #   log.info "Wrapping anchors"
+        #   doc.xpath('//body/a').each do |a|
+        #     wrapper = Nokogiri::XML::Node.new('p', doc)
+        #     a.add_next_sibling(wrapper)
+        #     wrapper << a
+        #   end
+        #   doc
+        # end
+      end
+    end
+  end
+end