RubyGems - repub - Versions diffs - 0.3.0 - Mend

repub 0.3.0

Files changed (41) hide show

data/lib/repub/app/fetcher.rb ADDED Viewed

@@ -0,0 +1,164 @@
+require 'fileutils'
+require 'digest/sha1'
+require 'shellwords'
+require 'uri'
+require 'iconv'
+require 'rubygems'
+old_verbose = $VERBOSE
+$VERBOSE = false
+require 'UniversalDetector'
+$VERBOSE = old_verbose
+module Repub
+  class App
+    module Fetcher
+      class FetcherException < RuntimeError; end
+      def fetch
+        Fetcher.new(options).fetch
+      end
+      # File extensions grouped by asset category. Cache#for_url globs the
+      # cache directory for each extension to build its +assets+ hash.
+      AssetTypes = {
+        :documents => %w[html htm],
+        :stylesheets => %w[css],
+        :images => %w[jpg jpeg png gif svg]
+      }
+      class Fetcher
+        include Logger
+        Downloaders = {
+          :wget     => { :cmd => 'wget', :options => '-nv -E -H -k -p -nH -nd' },
+          :httrack  => { :cmd => 'httrack', :options => '-gB -r2 +*.css +*.jpg -*.xml -*.html' }
+        }
+        def initialize(options)
+          @options = options
+          @downloader_path, @downloader_options = ENV['REPUB_DOWNLOADER'], ENV['REPUB_DOWNLOADER_OPTIONS']
+          begin
+            downloader = Downloaders[@options[:helper].to_sym] rescue Downloaders[:wget]
+            log.debug "-- Using #{downloader[:cmd]} #{downloader[:options]}"
+            @downloader_path ||= which(downloader[:cmd])
+            @downloader_options ||= downloader[:options]
+          rescue RuntimeError
+            raise FetcherException, "unknown helper '#{@options[:helper]}'"
+          end
+        end
+        def fetch
+          url = @options[:url]
+          raise FetcherException, "empty URL" if !url || url.empty?
+          begin
+            URI.parse(url)
+          rescue
+            raise FetcherException, "invalid URL: #{url}"
+          end
+          cmd = "#{@downloader_path} #{@downloader_options} #{url}"
+          Cache.for_url(url) do |cache|
+            log.debug "-- Downloading into #{cache.path}"
+            unless system(cmd) && !cache.empty?
+              raise FetcherException, "Fetch failed."
+            end
+          end
+        end
+        private
+        def which(cmd)
+          if !RUBY_PLATFORM.match('mswin')
+            cmd = `/usr/bin/which #{cmd}`.strip
+            raise FetcherException, "#{cmd}: helper not found." if cmd.empty?
+          end
+          cmd
+        end
+      end
+      class Cache
+        include Logger
+        def self.root
+          return File.join(App.data_path, 'cache')
+        end
+        def self.inventorize
+          # TODO
+        end
+        def self.cleanup
+          Dir.chdir(self.root) { FileUtils.rm_r(Dir.glob('*')) }
+        rescue
+          # ignore exceptions
+        end
+        attr_reader :url
+        attr_reader :name
+        attr_reader :path
+        attr_reader :assets
+        def self.for_url(url, &block)
+          self.new(url).for_url(&block)
+        end
+        def for_url(&block)
+          # Download stuff if not yet cached
+          cached = File.exist?(@path)
+          unless cached
+            FileUtils.mkdir_p(@path)
+            begin
+              Dir.chdir(@path) { yield self }
+            rescue
+              FileUtils.rm_r(@path)
+              raise
+            end
+          else
+            log.info "Using cached assets"
+            log.debug "-- Cache is #{@path}"
+          end
+          # Do post-download tasks
+          Dir.chdir(@path) do
+            # Enumerate assets
+            @assets = {}
+            AssetTypes.each_pair do |asset_type, file_types|
+              @assets[asset_type] ||= []
+              file_types.each do |file_type|
+                @assets[asset_type] << Dir.glob("*.#{file_type}")
+              end
+              @assets[asset_type].flatten!
+            end
+            # For freshly downloaded docs, detect encoding and convert to utf-8
+            unless cached
+              @assets[:documents].each do |doc|
+                log.info "Detecting encoding for #{doc}"
+                s = IO.read(doc)
+                raise FetcherException, "empty document" unless s
+                encoding = UniversalDetector.chardet(s)['encoding']
+                if encoding.downcase != 'utf-8'
+                  log.info "Looks like #{encoding}, converting to UTF-8"
+                  s = Iconv.conv('utf-8', encoding, IO.read(doc))
+                  File.open(doc, 'w') { |f| f.write(s) }
+                else
+                  log.info "Looks like UTF-8, no conversion needed"
+                end
+              end
+            end
+          end
+          self
+        end
+        def empty?
+          Dir.glob(File.join(@path, '*')).empty?
+        end
+        private
+        def initialize(url)
+          @url = url
+          @name = Digest::SHA1.hexdigest(@url)
+          @path = File.join(Cache.root, @name)
+        end
+      end
+    end
+  end
+end

data/lib/repub/app/logger.rb ADDED Viewed

@@ -0,0 +1,52 @@
+require 'singleton'
+module Repub
+  class App
+    module Logger
+      # Logging verbosity
+      #
+      LOGGER_QUIET = 0      # nothing except errors
+      LOGGER_NORMAL = 1     # info and above
+      LOGGER_VERBOSE = 2    # everything, including debuging noise
+      def log
+        Logger.instance
+      end
+      class Logger
+        include Singleton
+        attr_accessor :level
+        attr_accessor :stdout
+        attr_accessor :stderr
+        def debug(msg)
+          @stdout.puts(msg) if @level >= LOGGER_VERBOSE
+        end
+        def info(msg)
+          @stdout.puts(msg) if @level >= LOGGER_NORMAL
+        end
+        def error(msg)
+          @stderr.puts(msg) if @level >= LOGGER_QUIET
+        end
+        alias_method :warn, :error
+        def fatal(msg)
+          error(msg)
+          exit 1
+        end
+        private
+        def initialize
+          @level = LOGGER_NORMAL
+          @stdout = STDOUT
+          @stderr = STDERR
+        end
+      end
+    end
+  end
+end

data/lib/repub/app/options.rb ADDED Viewed

@@ -0,0 +1,180 @@
+require 'optparse'
+module Repub
+  class App
+    module Options
+      include Logger
+      attr_reader :options
+      # Builds the option hash from built-in defaults, the stored default
+      # profile and the command line, leaving the result in +options+.
+      #
+      # args - Array of command-line arguments. Recognized switches are
+      #        removed from it by the parser; the last remaining element is
+      #        taken as the URL.
+      #
+      # Prints help and exits with status 1 when args is empty; reports a
+      # missing URL or an invalid switch through log.fatal. The informational
+      # switches (-L, -C, -V, -h) exit with status 0 once they have done
+      # their work.
+      def parse_options(args)
+        # Default options
+        @options = {
+          :browser        => false,
+          :css            => nil,
+          :encoding       => nil,
+          :fixup          => true,
+          :helper         => 'wget',
+          :metadata       => {},
+          :output_path    => Dir.getwd,
+          :profile        => 'default',
+          :remove         => [],
+          :rx             => [],
+          # Copy the defaults: -x stores into this hash and must not alter the
+          # shared Parser::Selectors constant.
+          :selectors      => Parser::Selectors.dup,
+          :url            => nil,
+          :verbosity      => Repub::App::Logger::LOGGER_NORMAL,
+        }
+        # Load default profile
+        if load_profile(options[:profile]).empty?
+          write_profile(options[:profile])
+        end
+        # Parse command line
+        parser = OptionParser.new do |opts|
+          opts.banner = <<-BANNER.gsub(/^          /,'')
+            Repub is a simple HTML to ePub converter.
+            Usage: #{App.name} [options] url
+            General options:
+          BANNER
+          opts.on("-D", "--downloader NAME ", ['wget', 'httrack'],
+            "Which downloader to use to get files (wget or httrack).",
+            "Default is #{options[:helper]}."
+          ) { |value| options[:helper] = value }
+          opts.on("-o", "--output PATH", String,
+            "Output path for generated ePub file.",
+            "Default is #{options[:output_path]}/<Parsed_Title>.epub"
+          ) { |value| options[:output_path] = File.expand_path(value) }
+          opts.on("-w", "--write-profile NAME", String,
+            "Save given options for later reuse as profile NAME."
+          ) { |value| options[:profile] = value; write_profile(value) }
+          opts.on("-l", "--load-profile NAME", String,
+            "Load options from saved profile NAME."
+          ) { |value| options[:profile] = value; load_profile(value) }
+          opts.on("-W", "--write-default",
+            "Save given options for later reuse as default profile."
+          ) { write_profile }
+          # Informational switches below finish successfully, hence status 0.
+          opts.on("-L", "--list-profiles",
+            "List saved profiles."
+          ) { list_profiles; exit 0 }
+          opts.on("-C", "--cleanup",
+            "Clean up download cache."
+          ) { Fetcher::Cache.cleanup; exit 0 }
+          opts.on("-v", "--verbose",
+            "Turn on verbose output."
+          ) { options[:verbosity] = Repub::App::Logger::LOGGER_VERBOSE }
+          opts.on("-q", "--quiet",
+            "Turn off any output except errors."
+          ) { options[:verbosity] = Repub::App::Logger::LOGGER_QUIET }
+          opts.on("-V", "--version",
+            "Show version."
+          ) { puts Repub.version; exit 0 }
+          opts.on("-h", "--help",
+            "Show this help message."
+          ) { help opts; exit 0 }
+          opts.separator ""
+          opts.separator "  Parser options:"
+          opts.on("-x", "--selector NAME:VALUE", String,
+            "Set parser XPath selector NAME to VALUE.",
+            "Recognized selectors are: [title toc toc_item toc_section]"
+          ) do |value|
+            begin
+              # A value without ':' makes match return nil; the resulting
+              # error is turned into a fatal usage message.
+              name, value = value.match(/([^:]+):(.*)/)[1, 2]
+            rescue
+              log.fatal "ERROR: invalid argument: -x '#{value}'. See '#{App.name} --help'."
+            end
+            options[:selectors][name.to_sym] = value
+          end
+          opts.on("-m", "--meta NAME:VALUE", String,
+            "Set publication information metadata NAME to VALUE.",
+            "Valid metadata names are: [creator date description",
+            "language publisher relation rights subject title]"
+          ) do |value|
+            begin
+              name, value = value.match(/([^:]+):(.*)/)[1, 2]
+            rescue
+              log.fatal "ERROR: invalid argument: -m '#{value}'. See '#{App.name} --help'."
+            end
+            options[:metadata][name.to_sym] = value
+          end
+          opts.on("-F", "--no-fixup",
+            "Do not attempt to make document meet XHTML 1.0 Strict.",
+            "Default is to try and fix things that are broken. "
+          ) { |value| options[:fixup] = false }
+          opts.on("-e", "--encoding NAME", String,
+            "Set source document encoding. Default is to autodetect."
+          ) { |value| options[:encoding] = value }
+          opts.separator ""
+          opts.separator "  Post-processing options:"
+          opts.on("-s", "--stylesheet PATH", String,
+            "Use custom stylesheet at PATH to add or override existing",
+            "CSS references in the source document."
+          ) { |value| options[:css] = File.expand_path(value) }
+          opts.on("-X", "--remove SELECTOR", String,
+            "Remove source element using XPath selector.",
+            "Use -X- to ignore stored profile."
+          ) { |value| value == '-' ? options[:remove] = [] : options[:remove] << value }
+          opts.on("-R", "--rx /PATTERN/REPLACEMENT/", String,
+            "Edit source HTML using regular expressions.",
+            "Use -R- to ignore stored profile."
+          ) { |value| value == '-' ? options[:rx] = [] : options[:rx] << value }
+          opts.on("-B", "--browse",
+            "After processing, open resulting HTML in default browser."
+          ) { |value| options[:browser] = true }
+        end
+        # No arguments at all is a usage error.
+        if args.empty?
+          help parser
+          exit 1
+        end
+        begin
+          parser.parse! args
+        rescue OptionParser::ParseError => ex
+          log.fatal "ERROR: #{ex.to_s}. See '#{App.name} --help'."
+        end
+        options[:url] = args.last
+        if options[:url].nil? || options[:url].empty?
+          help parser
+          log.fatal "ERROR: Please specify an URL."
+        end
+      end
+      def help(opts)
+        puts opts
+        puts
+        puts "  Current profile (#{options[:profile]}):"
+        dump_profile(options[:profile])
+        puts
+      end
+    end
+  end
+end

data/lib/repub/app/parser.rb ADDED Viewed

@@ -0,0 +1,152 @@
+require 'rubygems'
+require 'nokogiri'
+module Repub
+  class App
+    module Parser
+      class ParserException < RuntimeError; end
+      def parse(cache)
+        Parser.new(options).parse(cache)
+      end
+      # Default selectors
+      #
+      # :title and :toc are evaluated against the whole document (first match
+      # is used); :toc_item is evaluated relative to a TOC section and
+      # :toc_section relative to a TOC item.
+      Selectors = {
+        :title        => '//h1',
+        :toc          => '//ul',
+        :toc_item     => './li',
+        :toc_section  => './ul'
+      }
+      class Parser
+        include Logger
+        attr_reader :cache
+        attr_reader :uid
+        attr_reader :title
+        attr_reader :title_html
+        attr_reader :toc
+        def initialize(options)
+          @selectors = options[:selectors] || Selectors
+          @fixup = options[:fixup]
+        end
+        def parse(cache)
+          raise ParserException, "No HTML document found" if
+            cache.assets[:documents].empty?
+          raise ParserException, "More than one HTML document found, this is not supported (yet)" if
+            cache.assets[:documents].size > 1
+          @cache = cache
+          @asset = @cache.assets[:documents][0]
+          log.debug "-- Parsing #{@asset}"
+          @doc = Nokogiri::HTML.parse(open(File.join(@cache.path, @asset)), nil, 'UTF-8')
+          @uid = @cache.name
+          parse_title
+          parse_title_html
+          parse_toc
+          self
+        end
+        private
+        UNTITLED = 'Untitled'
+        def parse_title
+          log.debug "-- Looking for title with #{@selectors[:title]}"
+          el = @doc.at(@selectors[:title])
+          if el
+            if el.children.empty?
+              title_text = el.inner_text
+            else
+              title_text = el.children.map{|c| c.inner_text }.join(' ')
+            end
+            @title = title_text.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
+            log.info "Found title \"#{@title}\""
+          else
+            @title = UNTITLED
+            log.warn "** Could not find document title, using '#{@title}'"
+          end
+        end
+        def parse_title_html
+          log.debug "-- Looking for html title with #{@selectors[:title]}"
+          el = @doc.at(@selectors[:title])
+          @title_html = el ? el.inner_html.gsub(/[\r\n]/, '') : UNTITLED
+        end
+        # Helper container for TOC items
+        #
+        class TocItem < Struct.new(
+            :title,
+            :uri,
+            :fragment_id
+          )
+          def initialize(title, uri_with_fragment_id, subitems, asset)
+            self.title = title
+            self.uri, self.fragment_id = uri_with_fragment_id.split(/#/)
+            self.uri = asset if self.uri.empty?
+            @subitems = subitems || []
+          end
+          attr_reader :subitems
+          def src
+            "#{uri}##{fragment_id}"
+          end
+        end
+        def parse_toc
+          log.debug "-- Looking for TOC with #{@selectors[:toc]}"
+          el = @doc.xpath(@selectors[:toc]).first
+          if el
+            @toc = parse_toc_section(el)
+            log.info "Found TOC with #{@toc.size} top-level items"
+          else
+            @toc = []
+            log.warn "** Could not find document table of contents"
+          end
+        end
+        def parse_toc_section(section)
+          toc = []
+          log.debug "-- Looking for TOC items with #{@selectors[:toc_item]}"
+          section.xpath(@selectors[:toc_item]).each do |item|
+            # Get item's anchor and href
+            a = item.name == 'a' ? item : item.at('a')
+            next if !a
+            href = a[:href]
+            next if !href
+            # Is this a leaf item or node ?
+            subsection = item.xpath(@selectors[:toc_section]).first
+            if subsection
+              # Item has subsection, use anchor text for title
+              title = a.inner_text
+            else
+              # Leaf item, glue inner_text from all children
+              title = item.children.map{|c| c.inner_text }.join(' ')
+            end
+            title = title.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
+            log.debug "-- Found item: #{title}"
+            # Parse sub-section
+            if subsection
+              log.debug "-- Found section with #{@selectors[:toc_section]}"
+              log.debug "-- >"
+              subitems = parse_toc_section(subsection)
+              log.debug '-- .'
+            end
+            toc << TocItem.new(title, href, subitems, @asset)
+          end
+          toc
+        end
+      end
+    end
+  end
+end