invisiblellama-repub 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,162 @@
1
require 'fileutils'
require 'digest/sha1'
require 'shellwords'
require 'uri'
require 'iconv'
require 'rubygems'
6
+
7
# XXX: suppress warnings from chardet (until they fix them).
# Warnings are silenced only while the library loads; afterwards $VERBOSE is
# put back to whatever it was before, instead of being forced to true for the
# rest of the process.
repub_saved_verbose = $VERBOSE
begin
  $VERBOSE = false
  require 'UniversalDetector'
ensure
  $VERBOSE = repub_saved_verbose
end
11
+
12
+ module Repub
13
+ class App
14
+ module Fetcher
15
+
16
# Error raised by the fetcher and the cache (bad URL, missing helper,
# failed download, empty document).
class FetcherException < RuntimeError
end
17
+
18
# Mixin entry point: builds a Fetcher from the includer's +options+ and
# returns the result of running it.
def fetch
  fetcher = Fetcher.new(options)
  fetcher.fetch
end
21
+
22
# File extensions (without the dot) that are collected for each asset type.
AssetTypes = {
  :documents   => ['html', 'htm'],
  :stylesheets => ['css'],
  :images      => ['jpg', 'jpeg', 'png', 'gif', 'svg']
}
27
+
28
+ class Fetcher
29
+ include Logger
30
+
31
# Supported download helpers: executable name and default command-line
# switches for each.
Downloaders = {
  :wget => {
    :cmd     => 'wget',
    :options => '-nv -E -H -k -p -nH -nd'
  },
  :httrack => {
    :cmd     => 'httrack',
    :options => '-gB -r2 +*.css +*.jpg -*.xml -*.html'
  }
}
35
+
36
# Prepares a fetcher for the given options hash.
#
# options[:helper] selects an entry of Downloaders; a missing or empty helper
# name falls back to :wget. The environment variables REPUB_DOWNLOADER and
# REPUB_DOWNLOADER_OPTIONS, when set, take precedence over the executable
# path and the switches of the selected helper.
#
# Raises FetcherException when the helper name is not a key of Downloaders.
# Errors raised by #which (helper executable not found) propagate with their
# own message instead of being reported as an unknown helper.
def initialize(options)
  @options = options
  @downloader_path = ENV['REPUB_DOWNLOADER']
  @downloader_options = ENV['REPUB_DOWNLOADER_OPTIONS']

  helper = @options[:helper]
  usable = helper.respond_to?(:to_sym) && !helper.to_s.empty?
  downloader = Downloaders[usable ? helper.to_sym : :wget]
  # an unknown name yields nil here; report it explicitly
  raise FetcherException, "unknown helper '#{helper}'" if downloader.nil?

  log.debug "-- Using #{downloader[:cmd]} #{downloader[:options]}"
  @downloader_path ||= which(downloader[:cmd])
  @downloader_options ||= downloader[:options]
end
48
+
49
# Downloads the document at options[:url] into the cache and returns the
# value of Cache.for_url for that URL.
#
# The helper is started without a shell: the configured helper path and
# option string are split into words and passed, together with the URL, as
# separate arguments. Characters such as '&', ';' or '?' in the URL are
# therefore handed to the helper verbatim instead of being interpreted by
# the shell.
#
# Raises FetcherException if the URL is missing or cannot be parsed, or if
# the helper fails or leaves the cache directory empty.
def fetch
  url = @options[:url]
  raise FetcherException, "empty URL" if url.nil? || url.empty?
  begin
    URI.parse(url)
  rescue
    raise FetcherException, "invalid URL: #{url}"
  end

  command = Shellwords.shellwords(@downloader_path.to_s)
  command.concat(Shellwords.shellwords(@downloader_options.to_s))
  command << url

  Cache.for_url(url) do |cache|
    log.debug "-- Downloading into #{cache.path}"
    # system returns false/nil when the helper fails or cannot be started
    unless system(*command) && !cache.empty?
      raise FetcherException, "Fetch failed."
    end
  end
end
65
+
66
+ private
67
+
68
# Resolves the helper executable +cmd+ to a path by looking through the
# directories listed in ENV['PATH'] for an executable file of that name.
# On mswin platforms +cmd+ is returned unchanged and no lookup is done.
#
# Raises FetcherException, naming the helper that was looked for, when no
# executable is found.
def which(cmd)
  return cmd if RUBY_PLATFORM.match('mswin')
  ENV['PATH'].to_s.split(File::PATH_SEPARATOR).each do |dir|
    # an empty PATH entry stands for the current directory
    candidate = File.join(dir.empty? ? '.' : dir, cmd)
    return candidate if File.file?(candidate) && File.executable?(candidate)
  end
  raise FetcherException, "#{cmd}: helper not found."
end
75
+ end
76
+
77
+ class Cache
78
+ include Logger
79
+
80
# Directory that holds all cache entries: 'cache' under App.data_path.
def self.root
  data_dir = App.data_path
  File.join(data_dir, 'cache')
end
83
+
84
# Placeholder, not implemented yet; currently does nothing and returns nil.
def self.inventorize
  # TODO
  nil
end
87
+
88
# Best-effort removal of every entry matching '*' inside the cache root.
# Any error raised while changing directory or deleting is swallowed.
def self.cleanup
  begin
    Dir.chdir(self.root) do
      entries = Dir.glob('*')
      FileUtils.rm_r(entries)
    end
  rescue
    # ignore exceptions
  end
end
93
+
94
# url::    URL this cache entry was created for
# name::   SHA1 hex digest of the URL
# path::   directory of this entry (name under Cache.root)
# assets:: hash of asset type => file names, filled in by #for_url
attr_reader :url, :name, :path, :assets
98
+
99
# Shortcut: creates the cache entry for +url+ and passes the block on to the
# instance-level #for_url, returning its result.
def self.for_url(url, &block)
  entry = self.new(url)
  entry.for_url(&block)
end
102
+
103
# Makes sure the cache directory for this URL is populated, then indexes and
# normalises its contents. Returns self.
#
# If the directory does not exist yet it is created and the block is called
# with self while the working directory is the cache directory. When the
# block does not run to completion -- for any reason, including an interrupt
# -- the directory is removed again, so a partial download is not mistaken
# for a finished one on the next run. Exceptions from the block propagate.
#
# Afterwards @assets is rebuilt from the files present (see AssetTypes) and
# each HTML document whose detected encoding is not UTF-8 is rewritten as
# UTF-8. Documents whose encoding cannot be detected are left untouched.
#
# Raises FetcherException for an empty document.
def for_url(&block)
  # if not yet cached, download stuff
  if File.exist?(@path)
    log.debug "-- Already cached in #{@path}"
  else
    FileUtils.mkdir_p(@path)
    downloaded = false
    begin
      Dir.chdir(@path) { yield self }
      downloaded = true
    ensure
      # ensure rather than rescue, so non-StandardError aborts clean up too
      FileUtils.rm_r(@path) unless downloaded
    end
  end
  # do post-download tasks
  if File.exist?(@path)
    Dir.chdir(@path) do
      # enumerate assets
      @assets = {}
      AssetTypes.each_pair do |asset_type, file_types|
        @assets[asset_type] = file_types.map { |ext| Dir.glob("*.#{ext}") }.flatten
      end
      # detect encoding and convert to utf-8 if needed
      @assets[:documents].each do |doc|
        log.debug "-- Detecting encoding for #{doc}"
        s = IO.read(doc)
        # IO.read gives "" (not nil) for an empty file, so test for both
        raise FetcherException, "empty document" if s.nil? || s.empty?
        encoding = UniversalDetector::chardet(s)['encoding']
        if encoding.nil?
          log.debug "-- Could not detect encoding, leaving document as is"
        elsif encoding.downcase != 'utf-8'
          log.debug "-- Looks like it's #{encoding}, will convert to UTF-8"
          s = Iconv.conv('utf-8', encoding, s)
          File.open(doc, 'w') { |f| f.write(s) }
        else
          log.debug "-- Looks like it's UTF-8, no conversion needed"
        end
      end
    end
  end
  self
end
146
+
147
# True when nothing inside the cache directory matches the '*' glob.
def empty?
  pattern = File.join(@path, '*')
  Dir.glob(pattern).empty?
end
150
+
151
+ private
152
+
153
# Remembers the URL and derives the entry name (SHA1 hex digest of the URL)
# and the entry directory (that name under Cache.root).
def initialize(url)
  @url = url
  digest = Digest::SHA1.hexdigest(@url)
  @name = digest
  @path = File.join(Cache.root, digest)
end
158
+ end
159
+
160
+ end
161
+ end
162
+ end
@@ -0,0 +1,52 @@
1
+ require 'singleton'
2
+
3
+ module Repub
4
+ class App
5
+ module Logger
6
+
7
# Logging verbosity levels:
#   LOGGER_QUIET   - nothing except errors
#   LOGGER_NORMAL  - info and above
#   LOGGER_VERBOSE - everything, including debugging noise
LOGGER_QUIET, LOGGER_NORMAL, LOGGER_VERBOSE = 0, 1, 2
12
+
13
# Gives including classes access to the shared Logger singleton.
def log
  shared = Logger.instance
  shared
end
16
+
17
# Singleton logger. Each message goes to the configured stdout or stderr
# stream, provided the current level is at least the level the message needs.
class Logger
  include Singleton

  # level::  current verbosity (one of the LOGGER_* constants)
  # stdout:: stream used by #debug and #info
  # stderr:: stream used by #error, #warn and #fatal
  attr_accessor :level, :stdout, :stderr

  # Writes +msg+ to stdout when the level is LOGGER_VERBOSE or higher.
  def debug(msg)
    emit(@stdout, msg, LOGGER_VERBOSE)
  end

  # Writes +msg+ to stdout when the level is LOGGER_NORMAL or higher.
  def info(msg)
    emit(@stdout, msg, LOGGER_NORMAL)
  end

  # Writes +msg+ to stderr when the level is LOGGER_QUIET or higher.
  def error(msg)
    emit(@stderr, msg, LOGGER_QUIET)
  end
  alias warn error

  # Reports +msg+ through #error, then exits the process with status 1.
  def fatal(msg)
    error(msg)
    exit 1
  end

  private

  # Starts at normal verbosity, writing to the process's STDOUT and STDERR.
  def initialize
    @level, @stdout, @stderr = LOGGER_NORMAL, STDOUT, STDERR
  end

  # Prints +msg+ on +stream+ if the current level reaches +threshold+.
  def emit(stream, msg, threshold)
    stream.puts(msg) if @level >= threshold
  end
end
49
+
50
+ end
51
+ end
52
+ end
@@ -0,0 +1,173 @@
1
+ require 'optparse'
2
+
3
+ module Repub
4
+ class App
5
+ module Options
6
+
7
+ attr_reader :options
8
+
9
# Builds @options from the built-in defaults, the stored default profile and
# the command line in +args+ (which is consumed by OptionParser#parse!).
#
# The last remaining argument is taken as the URL. Prints help and exits with
# status 1 when +args+ is empty or no URL is left; prints an error and exits
# with status 1 on an option parsing error. The informational options
# (-h, -V, -L, -C) exit with status 0 once they have done their work.
#
# NAME:VALUE arguments (-x, -m) are split at the first colon only, so VALUE
# may contain colons; an argument without a name or without a colon is
# reported as an invalid argument.
def parse_options(args)

  # Default options
  @options = {
    :browser      => false,
    :css          => nil,
    :encoding     => nil,
    :fixup        => true,
    :helper       => 'wget',
    :metadata     => {},
    :output_path  => Dir.getwd,
    :profile      => 'default',
    :remove       => [],
    :rx           => [],
    # a copy, so that -x does not modify the shared Parser::Selectors constant
    :selectors    => Parser::Selectors.dup,
    :url          => nil,
    :verbosity    => Repub::App::Logger::LOGGER_NORMAL
  }

  # Load default profile
  if load_profile(options[:profile]).empty?
    write_profile(options[:profile])
  end

  # Turns "NAME:VALUE" into [:NAME, "VALUE"], splitting at the first colon
  split_pair = lambda do |arg|
    name, val = arg.split(':', 2)
    raise OptionParser::InvalidArgument, arg if name.nil? || name.empty? || val.nil?
    [name.to_sym, val]
  end

  # Parse command line
  parser = OptionParser.new do |opts|
    opts.banner = [
      "",
      "Repub is a simple HTML to ePub converter.",
      "",
      "Usage: #{App.name} [options] url",
      "",
      "General options:",
      ""
    ].join("\n")

    opts.on("-D", "--downloader NAME ", ['wget', 'httrack'],
      "Which downloader to use to get files (wget or httrack).",
      "Default is #{options[:helper]}."
    ) { |value| options[:helper] = value }

    opts.on("-o", "--output PATH", String,
      "Output path for generated ePub file.",
      "Default is #{options[:output_path]}/<Parsed_Title>.epub"
    ) { |value| options[:output_path] = File.expand_path(value) }

    opts.on("-w", "--write-profile NAME", String,
      "Save given options for later reuse as profile NAME."
    ) { |value| options[:profile] = value; write_profile(value) }

    opts.on("-l", "--load-profile NAME", String,
      "Load options from saved profile NAME."
    ) { |value| options[:profile] = value; load_profile(value) }

    opts.on("-W", "--write-default",
      "Save given options for later reuse as default profile."
    ) { write_profile }

    opts.on("-L", "--list-profiles",
      "List saved profiles."
    ) { list_profiles; exit 0 }

    opts.on("-C", "--cleanup",
      "Clean up download cache."
    ) { Fetcher::Cache.cleanup; exit 0 }

    opts.on("-v", "--verbose",
      "Turn on verbose output."
    ) { options[:verbosity] = Repub::App::Logger::LOGGER_VERBOSE }

    opts.on("-q", "--quiet",
      "Turn off any output except errors."
    ) { options[:verbosity] = Repub::App::Logger::LOGGER_QUIET }

    opts.on("-V", "--version",
      "Show version."
    ) { puts Repub.version; exit 0 }

    opts.on("-h", "--help",
      "Show this help message."
    ) { help opts; exit 0 }

    opts.separator ""
    opts.separator " Parser options:"

    opts.on("-x", "--selector NAME:VALUE", String,
      "Set parser XPath or CSS selector NAME to VALUE.",
      "Recognized selectors are: [title toc toc_item toc_section]"
    ) do |value|
      name, selector = split_pair.call(value)
      options[:selectors][name] = selector
    end

    opts.on("-m", "--meta NAME:VALUE", String,
      "Set publication information metadata NAME to VALUE.",
      "Valid metadata names are: [creator date description",
      "language publisher relation rights subject title]"
    ) do |value|
      name, meta = split_pair.call(value)
      options[:metadata][name] = meta
    end

    opts.on("-F", "--no-fixup",
      "Do not attempt to make document meet XHTML 1.0 Strict.",
      "Default is to try and fix things that are broken. "
    ) { |value| options[:fixup] = false }

    opts.on("-e", "--encoding NAME", String,
      "Set source document encoding. Default is to autodetect."
    ) { |value| options[:encoding] = value }

    opts.separator ""
    opts.separator " Post-processing options:"

    opts.on("-s", "--stylesheet PATH", String,
      "Use custom stylesheet at PATH to add or override existing",
      "CSS references in the source document."
    ) { |value| options[:css] = File.expand_path(value) }

    opts.on("-X", "--remove SELECTOR", String,
      "Remove source element using XPath or CSS selector.",
      "Use -X- to ignore stored profile."
    ) { |value| value == '-' ? options[:remove] = [] : options[:remove] << value }

    opts.on("-R", "--rx /PATTERN/REPLACEMENT/", String,
      "Edit source HTML using regular expressions.",
      "Use -R- to ignore stored profile."
    ) { |value| value == '-' ? options[:rx] = [] : options[:rx] << value }

    opts.on("-B", "--browse",
      "After processing, open resulting HTML in default browser."
    ) { |value| options[:browser] = true }

  end

  if args.empty?
    help parser
    exit 1
  end

  begin
    parser.parse! args
  rescue OptionParser::ParseError => ex
    STDERR.puts "ERROR: #{ex.to_s}. See '#{App.name} --help'."
    exit 1
  end

  options[:url] = args.last
  if options[:url].nil? || options[:url].empty?
    help parser
    STDERR.puts "ERROR: Please specify an URL."
    exit 1
  end
end
162
+
163
# Prints +opts+ (the option summary), then the name and the dumped contents
# of the currently selected profile, each followed by a blank line.
def help(opts)
  puts opts
  puts ''
  puts " Current profile (#{options[:profile]}):"
  dump_profile(options[:profile])
  puts ''
end
170
+
171
+ end
172
+ end
173
+ end
@@ -0,0 +1,139 @@
1
+ require 'rubygems'
2
+ require 'hpricot'
3
+
4
+ module Repub
5
+ class App
6
+ module Parser
7
+
8
# Error raised when the cached download cannot be parsed (no HTML document,
# or more than one).
class ParserException < RuntimeError
end
9
+
10
# Mixin entry point: builds a Parser from the includer's +options+ and
# returns the result of parsing +cache+ with it.
def parse(cache)
  parser = Parser.new(options)
  parser.parse(cache)
end
13
+
14
# Default hpricot selectors for the document title, the table of contents,
# its items and its nested sections.
Selectors = {
  :title       => "//h1",
  :toc         => "//div.toc/ul",
  :toc_item    => "/li",
  :toc_section => "/ul"
}
22
+
23
+ class Parser
24
+ include Logger
25
+
26
# cache::      cache object handed to #parse
# uid::        name of that cache entry
# title::      plain-text document title
# title_html:: inner HTML of the title element
# toc::        list of top-level table-of-contents items
attr_reader :cache, :uid, :title, :title_html, :toc
31
+
32
# Takes the selectors (falling back to the default Selectors when none are
# given) and the fixup flag from the options hash.
def initialize(options)
  @fixup = options[:fixup]
  @selectors = options[:selectors]
  @selectors = Selectors unless @selectors
end
36
+
37
# Parses the single HTML document of +cache+ and fills in uid, title,
# title_html and toc. Returns self.
#
# Raises ParserException when the cache holds no HTML document or more than
# one.
def parse(cache)
  documents = cache.assets[:documents]
  raise ParserException, "No HTML document found" if documents.empty?
  raise ParserException, "More than one HTML document found, this is not supported (yet)" if documents.size > 1

  @cache = cache
  @asset = documents[0]
  log.debug "-- Parsing #{@asset}"
  # block form closes the file handle as soon as Hpricot returns
  @doc = File.open(File.join(@cache.path, @asset)) { |io| Hpricot(io, @fixup) }

  @uid = @cache.name
  parse_title
  parse_title_html
  parse_toc

  self
end
55
+
56
+ private
57
+
58
UNTITLED = 'Untitled'

# Sets @title from the element matched by the :title selector.
#
# The text of the element (or of its children, joined by spaces) is
# flattened to one line: every run of whitespace, line breaks included,
# becomes a single space, so words on separate lines stay separate.
# Falls back to UNTITLED, with a warning, when no element matches or the
# resulting text is blank.
def parse_title
  log.debug "-- Looking for title with #{@selectors[:title]}"
  el = @doc.at(@selectors[:title])
  title_text = nil
  if el
    kids = el.children
    # tolerate a nil as well as an empty child list
    pieces = (kids.nil? || kids.empty?) ? [el.inner_text] : kids.map { |c| c.inner_text }
    title_text = pieces.join(' ').gsub(/\s+/, ' ').strip
  end
  if title_text && !title_text.empty?
    @title = title_text
    log.info "Found title \"#{@title}\""
  else
    @title = UNTITLED
    log.warn "** Could not parse document title, using '#{@title}'"
  end
end
76
+
77
# Sets @title_html to the inner HTML of the element matched by the :title
# selector, flattened to one line, or to UNTITLED when nothing matches.
# Line breaks (with any whitespace around them) are replaced by a single
# space rather than deleted, so words on adjacent lines are not run together.
def parse_title_html
  log.debug "-- Looking for html title with #{@selectors[:title]}"
  el = @doc.at(@selectors[:title])
  @title_html = el ? el.inner_html.gsub(/\s*[\r\n]+\s*/, ' ').strip : UNTITLED
end
82
+
83
# One entry of the table of contents: a title, the document it points to
# (uri), an optional fragment identifier and a list of nested items.
class TocItem < Struct.new(
    :title,
    :uri,
    :fragment_id
  )

  # title::                display text of the entry
  # uri_with_fragment_id:: link target, optionally followed by '#fragment'
  # subitems::             nested TocItems, or nil for none
  # asset::                document used as uri when the link has no document
  #                        part (for example '#top' or an empty href)
  def initialize(title, uri_with_fragment_id, subitems, asset)
    self.title = title
    # to_s covers a nil href; the limit keeps everything after the first '#'
    target, fragment = uri_with_fragment_id.to_s.split('#', 2)
    self.uri = (target.nil? || target.empty?) ? asset : target
    self.fragment_id = (fragment.nil? || fragment.empty?) ? nil : fragment
    @subitems = subitems || []
  end

  attr_reader :subitems

  # Link target for this entry: the uri, followed by '#fragment' only when
  # there is a fragment.
  def src
    return uri if fragment_id.nil?
    "#{uri}##{fragment_id}"
  end
end
102
+
103
# Sets @toc from the element matched by the :toc selector, or to an empty
# list (with a warning) when no such element exists.
def parse_toc
  log.debug "-- Looking for TOC with #{@selectors[:toc]}"
  root = @doc.at(@selectors[:toc])
  if !root
    @toc = []
    log.warn "** Could not parse document table of contents"
  else
    @toc = parse_toc_section(root)
    log.info "Found TOC with #{@toc.size} top-level items"
  end
end
114
+
115
# Builds the list of TocItems for one TOC section element.
#
# Every element matched by the :toc_item selector that is, or contains, a
# link with an href becomes a TocItem; items without a link or href are
# skipped. Sections matched inside an item by the :toc_section selector are
# parsed recursively, and the items of all of them are collected as that
# item's subitems.
def parse_toc_section(section)
  toc = []
  log.debug "-- Looking for TOC items with #{@selectors[:toc_item]}"
  section.search(@selectors[:toc_item]).each do |item|
    a = item.name == 'a' ? item : item.at('a')
    next if a.nil?
    href = a['href']
    next if href.nil?
    # NOTE(review): the text of the whole item is used, which presumably also
    # includes the text of nested sections -- confirm that is intended
    title = item.inner_text.gsub(/\s+/, ' ').strip
    subitems = nil
    log.debug "-- Found item: #{title}"
    item.search(@selectors[:toc_section]).each do |subsection|
      log.debug "-- Found section with #{@selectors[:toc_section]} >>>"
      # append, so that several nested sections under one item are all kept
      subitems = (subitems || []) + parse_toc_section(subsection)
      log.debug '-- <<<'
    end
    toc << TocItem.new(title, href, subitems, @asset)
  end
  toc
end
135
+ end
136
+
137
+ end
138
+ end
139
+ end