invisiblellama-repub 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/History.txt +3 -0
- data/README.txt +95 -0
- data/Rakefile +30 -0
- data/TODO.txt +2 -0
- data/bin/repub +24 -0
- data/lib/repub.rb +46 -0
- data/lib/repub/app.rb +42 -0
- data/lib/repub/app/builder.rb +200 -0
- data/lib/repub/app/fetcher.rb +162 -0
- data/lib/repub/app/logger.rb +52 -0
- data/lib/repub/app/options.rb +173 -0
- data/lib/repub/app/parser.rb +139 -0
- data/lib/repub/app/profile.rb +91 -0
- data/lib/repub/app/utility.rb +57 -0
- data/lib/repub/epub.rb +3 -0
- data/lib/repub/epub/container.rb +28 -0
- data/lib/repub/epub/content.rb +153 -0
- data/lib/repub/epub/toc.rb +139 -0
- data/lib/repub/mobi/.githidden +0 -0
- data/test/epub/test_container.rb +15 -0
- data/test/epub/test_content.rb +56 -0
- data/test/epub/test_toc.rb +29 -0
- data/test/test_builder.rb +8 -0
- data/test/test_fetcher.rb +36 -0
- data/test/test_logger.rb +76 -0
- data/test/test_parser.rb +32 -0
- metadata +139 -0
@@ -0,0 +1,162 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
require 'digest/sha1'
|
3
|
+
require 'uri'
|
4
|
+
require 'iconv'
|
5
|
+
require 'rubygems'
|
6
|
+
|
7
|
+
# XXX: suppress warnings from chardet (until they fix them)
|
8
|
+
$VERBOSE=false
|
9
|
+
require 'UniversalDetector'
|
10
|
+
$VERBOSE=true
|
11
|
+
|
12
|
+
module Repub
|
13
|
+
class App
|
14
|
+
module Fetcher
|
15
|
+
|
16
|
+
class FetcherException < RuntimeError; end
|
17
|
+
|
18
|
+
# Mixin entry point: builds a Fetcher from the includer's options and
# runs the download, returning whatever Fetcher#fetch returns.
def fetch
  downloader = Fetcher.new(options)
  downloader.fetch
end
|
21
|
+
|
22
|
+
# File extensions that make up each kind of cached asset. Cache#for_url
# globs "*.<ext>" for every extension listed here.
# Frozen (table and lists) so this shared lookup data cannot be altered
# by accident at runtime.
AssetTypes = {
  :documents   => %w[html htm].freeze,
  :stylesheets => %w[css].freeze,
  :images      => %w[jpg jpeg png gif svg].freeze
}.freeze
|
27
|
+
|
28
|
+
class Fetcher
|
29
|
+
include Logger
|
30
|
+
|
31
|
+
# Supported download helpers, keyed by helper name. Each entry gives the
# executable to look up (:cmd) and the default command-line switches
# (:options) used when REPUB_DOWNLOADER_OPTIONS is not set.
Downloaders = {
  :wget => {
    :cmd     => 'wget',
    :options => '-nv -E -H -k -p -nH -nd'
  },
  :httrack => {
    :cmd     => 'httrack',
    :options => '-gB -r2 +*.css +*.jpg -*.xml -*.html'
  }
}
|
35
|
+
|
36
|
+
# Prepares a fetcher for the given options hash.
#
# options[:helper] names the download helper (a key of Downloaders); when
# it is nil or empty, wget is used. REPUB_DOWNLOADER and
# REPUB_DOWNLOADER_OPTIONS, when set, override the helper's executable
# path and switches respectively.
#
# Raises FetcherException when the helper name is not in Downloaders, or
# (from #which) when the helper executable cannot be located.
def initialize(options)
  @options = options
  @downloader_path = ENV['REPUB_DOWNLOADER']
  @downloader_options = ENV['REPUB_DOWNLOADER_OPTIONS']

  helper = @options[:helper].to_s
  helper = 'wget' if helper.empty?

  # Look the helper up explicitly: an unknown name yields nil here, which
  # previously surfaced as a NoMethodError instead of the intended error.
  downloader = Downloaders[helper.to_sym]
  if downloader.nil?
    raise FetcherException, "unknown helper '#{@options[:helper]}'"
  end

  log.debug "-- Using #{downloader[:cmd]} #{downloader[:options]}"
  # No blanket rescue around #which: its "helper not found" error must
  # reach the caller unchanged rather than be reported as "unknown helper".
  @downloader_path ||= which(downloader[:cmd])
  @downloader_options ||= downloader[:options]
end
|
48
|
+
|
49
|
+
# Downloads options[:url] into the cache using the configured helper.
#
# Returns the value of Cache.for_url (the cache object for this URL).
# Raises FetcherException when the URL is empty or invalid, or when the
# helper fails or leaves the cache directory empty.
def fetch
  # Loaded here so this method carries its own dependency.
  require 'shellwords'

  url = @options[:url]
  raise FetcherException, "empty URL" if !url || url.empty?
  begin
    URI.parse(url)
  rescue URI::InvalidURIError
    raise FetcherException, "invalid URL: #{url}"
  end
  # A leading dash would be read by the helper as a switch, not a URL.
  raise FetcherException, "invalid URL: #{url}" if url[0, 1] == '-'

  # Build an argument vector instead of a shell string: the URL comes from
  # the user and may contain '&', ';', '?' and other shell metacharacters.
  # The helper path and switches are still split shell-style so quoted
  # values in REPUB_DOWNLOADER_OPTIONS keep working.
  argv = Shellwords.split("#{@downloader_path} #{@downloader_options}")
  argv << url

  Cache.for_url(url) do |cache|
    log.debug "-- Downloading into #{cache.path}"
    unless system(*argv) && !cache.empty?
      raise FetcherException, "Fetch failed."
    end
  end
end
|
65
|
+
|
66
|
+
private
|
67
|
+
|
68
|
+
# Resolves a helper executable name to a full path by searching PATH.
#
# cmd - executable name (or a path containing a slash).
#
# Returns the path of the first matching executable file. On platforms
# whose RUBY_PLATFORM matches 'mswin' the name is returned untouched.
# Raises FetcherException naming the helper when nothing is found.
def which(cmd)
  return cmd if RUBY_PLATFORM.match('mswin')

  name = cmd.to_s
  if name.include?('/')
    # An explicit path is checked as-is instead of being searched for.
    candidates = [name]
  else
    dirs = ENV['PATH'].to_s.split(File::PATH_SEPARATOR)
    # An empty PATH entry conventionally means the current directory.
    candidates = dirs.map { |dir| File.join(dir.empty? ? '.' : dir, name) }
  end

  found = candidates.find do |candidate|
    File.file?(candidate) && File.executable?(candidate)
  end
  # Keep the requested name for the message; it used to be overwritten
  # with the (empty) lookup result before being reported.
  raise FetcherException, "#{cmd}: helper not found." if found.nil?
  found
end
|
75
|
+
end
|
76
|
+
|
77
|
+
class Cache
|
78
|
+
include Logger
|
79
|
+
|
80
|
+
# Path of the cache root: the 'cache' directory under App.data_path.
def self.root
  File.join(App.data_path, 'cache')
end
|
83
|
+
|
84
|
+
# Placeholder: not implemented yet, does nothing and returns nil.
def self.inventorize
  # TODO
  nil
end
|
87
|
+
|
88
|
+
# Best-effort removal of every non-hidden entry under the cache root.
# Any error (missing root, permissions, ...) is deliberately swallowed.
def self.cleanup
  begin
    Dir.chdir(root) do
      entries = Dir.glob('*')
      FileUtils.rm_r(entries)
    end
  rescue
    # ignore exceptions
  end
end
|
93
|
+
|
94
|
+
attr_reader :url
|
95
|
+
attr_reader :name
|
96
|
+
attr_reader :path
|
97
|
+
attr_reader :assets
|
98
|
+
|
99
|
+
# Convenience constructor: creates the cache entry for url and runs the
# instance-level #for_url with the given block, returning its result.
def self.for_url(url, &block)
  entry = new(url)
  entry.for_url(&block)
end
|
102
|
+
|
103
|
+
# Makes sure the URL's content is present in the cache directory, then
# inventories the cached assets and normalizes documents to UTF-8.
#
# When the cache directory does not exist yet it is created and the block
# is yielded this cache object with the directory as the working
# directory; the block is expected to download into it. If the block does
# not complete (any exception, including an interrupt), the directory is
# removed again so a partial download is never mistaken for a cached one.
#
# Returns self. Raises FetcherException for an empty document; exceptions
# from the block are propagated.
def for_url(&block)
  # if not yet cached, download stuff
  if File.exist?(@path)
    log.debug "-- Already cached in #{@path}"
  else
    FileUtils.mkdir_p(@path)
    completed = false
    begin
      Dir.chdir(@path) { yield self }
      completed = true
    ensure
      # ensure (not rescue) so Ctrl-C and similar also trigger the cleanup
      FileUtils.rm_r(@path) unless completed
    end
  end

  # do post-download tasks
  if File.exist?(@path)
    Dir.chdir(@path) do
      # enumerate assets
      @assets = {}
      AssetTypes.each_pair do |asset_type, file_types|
        @assets[asset_type] =
          file_types.map { |file_type| Dir.glob("*.#{file_type}") }.flatten
      end

      # detect encoding and convert to utf-8 if needed
      @assets[:documents].each do |doc|
        log.debug "-- Detecting encoding for #{doc}"
        s = IO.read(doc)
        # IO.read gives "" (not nil) for an empty file, so test for that too
        raise FetcherException, "empty document" if s.nil? || s.empty?
        encoding = UniversalDetector::chardet(s)['encoding']
        if encoding.nil?
          # NOTE(review): assumes the detector reports a nil encoding when
          # it cannot decide; without this guard .downcase would crash.
          log.debug "-- Could not detect encoding, leaving document as is"
        elsif encoding.downcase != 'utf-8'
          log.debug "-- Looks like it's #{encoding}, will convert to UTF-8"
          s = Iconv.conv('utf-8', encoding, s)
          File.open(doc, 'w') { |f| f.write(s) }
        else
          log.debug "-- Looks like it's UTF-8, no conversion needed"
        end
      end
    end
  end
  self
end
|
146
|
+
|
147
|
+
# True when the cache directory holds no (non-hidden) entries.
def empty?
  pattern = File.join(@path, '*')
  entries = Dir.glob(pattern)
  entries.empty?
end
|
150
|
+
|
151
|
+
private
|
152
|
+
|
153
|
+
# Derives the cache entry for url: its name is the SHA1 hex digest of the
# URL and its path is that name under Cache.root.
def initialize(url)
  digest = Digest::SHA1.hexdigest(url)
  @url = url
  @name = digest
  @path = File.join(Cache.root, digest)
end
|
158
|
+
end
|
159
|
+
|
160
|
+
end
|
161
|
+
end
|
162
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'singleton'
|
2
|
+
|
3
|
+
module Repub
|
4
|
+
class App
|
5
|
+
module Logger
|
6
|
+
|
7
|
+
# Logging verbosity
|
8
|
+
#
|
9
|
+
LOGGER_QUIET = 0 # nothing except errors
|
10
|
+
LOGGER_NORMAL = 1 # info and above
|
11
|
+
LOGGER_VERBOSE = 2 # everything, including debugging noise
|
12
|
+
|
13
|
+
# Gives including classes access to the shared Logger singleton.
def log
  shared = Logger.instance
  shared
end
|
16
|
+
|
17
|
+
# Singleton logger writing to configurable output/error streams.
# Messages are emitted only when the current level is at or above the
# threshold of the method used (see the LOGGER_* constants).
class Logger
  include Singleton

  attr_accessor :level, :stdout, :stderr

  # Debugging noise; shown only at LOGGER_VERBOSE.
  def debug(msg)
    emit(@stdout, msg, LOGGER_VERBOSE)
  end

  # Regular progress messages; shown at LOGGER_NORMAL and above.
  def info(msg)
    emit(@stdout, msg, LOGGER_NORMAL)
  end

  # Errors go to the error stream at LOGGER_QUIET and above.
  def error(msg)
    emit(@stderr, msg, LOGGER_QUIET)
  end
  alias_method :warn, :error

  # Reports the error and terminates the process with status 1.
  def fatal(msg)
    error(msg)
    exit 1
  end

  private

  # Defaults: normal verbosity, process-wide standard streams.
  def initialize
    @stderr = STDERR
    @stdout = STDOUT
    @level = LOGGER_NORMAL
  end

  # Writes msg to stream when the current level reaches threshold.
  def emit(stream, msg, threshold)
    stream.puts(msg) if @level >= threshold
  end
end
|
49
|
+
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
@@ -0,0 +1,173 @@
|
|
1
|
+
require 'optparse'
|
2
|
+
|
3
|
+
module Repub
|
4
|
+
class App
|
5
|
+
module Options
|
6
|
+
|
7
|
+
attr_reader :options
|
8
|
+
|
9
|
+
# Builds @options from built-in defaults, the stored default profile and
# the command line.
#
# args - array of command-line arguments; recognized switches are removed
#        from it in place (OptionParser#parse!) and the last remaining
#        argument is taken as the URL.
#
# Exits the process with status 1 after printing help/version/profile
# listings, after cache cleanup, on a parse error, or when no URL is given.
def parse_options(args)

  # Default options
  @options = {
    :browser => false,
    :css => nil,
    :encoding => nil,
    :fixup => true,
    :helper => 'wget',
    :metadata => {},
    :output_path => Dir.getwd,
    :profile => 'default',
    :remove => [],
    :rx => [],
    # Copy the defaults: -x writes into this hash and must not alter the
    # shared Parser::Selectors constant.
    :selectors => Parser::Selectors.dup,
    :url => nil,
    :verbosity => Repub::App::Logger::LOGGER_NORMAL,
  }

  # Load default profile
  if load_profile(options[:profile]).empty?
    write_profile(options[:profile])
  end

  # Splits "NAME:VALUE" at the first colon only, so values that contain
  # colons themselves survive intact. An empty NAME is reported as a parse
  # error instead of crashing on nil.to_sym.
  split_pair = lambda do |text|
    name, val = text.split(/:/, 2)
    raise OptionParser::InvalidArgument, text if name.nil? || name.empty?
    [name.to_sym, val]
  end

  # Parse command line
  parser = OptionParser.new do |opts|
    # NOTE(review): banner text is reproduced exactly as found (no leading
    # indentation on its lines) — confirm against the released file.
    opts.banner = <<-BANNER.gsub(/^ /,'')

Repub is a simple HTML to ePub converter.

Usage: #{App.name} [options] url

General options:
    BANNER

    opts.on("-D", "--downloader NAME ", ['wget', 'httrack'],
      "Which downloader to use to get files (wget or httrack).",
      "Default is #{options[:helper]}."
    ) { |value| options[:helper] = value }

    opts.on("-o", "--output PATH", String,
      "Output path for generated ePub file.",
      "Default is #{options[:output_path]}/<Parsed_Title>.epub"
    ) { |value| options[:output_path] = File.expand_path(value) }

    opts.on("-w", "--write-profile NAME", String,
      "Save given options for later reuse as profile NAME."
    ) { |value| options[:profile] = value; write_profile(value) }

    opts.on("-l", "--load-profile NAME", String,
      "Load options from saved profile NAME."
    ) { |value| options[:profile] = value; load_profile(value) }

    opts.on("-W", "--write-default",
      "Save given options for later reuse as default profile."
    ) { write_profile }

    opts.on("-L", "--list-profiles",
      "List saved profiles."
    ) { list_profiles; exit 1 }

    opts.on("-C", "--cleanup",
      "Clean up download cache."
    ) { Fetcher::Cache.cleanup; exit 1 }

    opts.on("-v", "--verbose",
      "Turn on verbose output."
    ) { options[:verbosity] = Repub::App::Logger::LOGGER_VERBOSE }

    opts.on("-q", "--quiet",
      "Turn off any output except errors."
    ) { options[:verbosity] = Repub::App::Logger::LOGGER_QUIET }

    opts.on("-V", "--version",
      "Show version."
    ) { puts Repub.version; exit 1 }

    opts.on("-h", "--help",
      "Show this help message."
    ) { help opts; exit 1 }

    opts.separator ""
    opts.separator " Parser options:"

    opts.on("-x", "--selector NAME:VALUE", String,
      "Set parser XPath or CSS selector NAME to VALUE.",
      "Recognized selectors are: [title toc toc_item toc_section]"
    ) do |value|
      name, selector = split_pair.call(value)
      options[:selectors][name] = selector
    end

    opts.on("-m", "--meta NAME:VALUE", String,
      "Set publication information metadata NAME to VALUE.",
      "Valid metadata names are: [creator date description",
      "language publisher relation rights subject title]"
    ) do |value|
      name, meta = split_pair.call(value)
      options[:metadata][name] = meta
    end

    opts.on("-F", "--no-fixup",
      "Do not attempt to make document meet XHTML 1.0 Strict.",
      "Default is to try and fix things that are broken. "
    ) { |value| options[:fixup] = false }

    opts.on("-e", "--encoding NAME", String,
      "Set source document encoding. Default is to autodetect."
    ) { |value| options[:encoding] = value }

    opts.separator ""
    opts.separator " Post-processing options:"

    opts.on("-s", "--stylesheet PATH", String,
      "Use custom stylesheet at PATH to add or override existing",
      "CSS references in the source document."
    ) { |value| options[:css] = File.expand_path(value) }

    opts.on("-X", "--remove SELECTOR", String,
      "Remove source element using XPath or CSS selector.",
      "Use -X- to ignore stored profile."
    ) { |value| value == '-' ? options[:remove] = [] : options[:remove] << value }

    opts.on("-R", "--rx /PATTERN/REPLACEMENT/", String,
      "Edit source HTML using regular expressions.",
      "Use -R- to ignore stored profile."
    ) { |value| value == '-' ? options[:rx] = [] : options[:rx] << value }

    opts.on("-B", "--browse",
      "After processing, open resulting HTML in default browser."
    ) { |value| options[:browser] = true }

  end

  if args.empty?
    help parser
    exit 1
  end

  begin
    parser.parse! args
  rescue OptionParser::ParseError => ex
    STDERR.puts "ERROR: #{ex.to_s}. See '#{App.name} --help'."
    exit 1
  end

  # Whatever is left last after switch parsing is the URL
  options[:url] = args.last
  if options[:url].nil? || options[:url].empty?
    help parser
    STDERR.puts "ERROR: Please specify an URL."
    exit 1
  end
end
|
162
|
+
|
163
|
+
# Prints the option summary followed by a dump of the current profile.
#
# opts - the object to print as usage text (the OptionParser in this file).
def help(opts)
  puts opts
  puts
  profile = options[:profile]
  puts " Current profile (#{profile}):"
  dump_profile(profile)
  puts
end
|
170
|
+
|
171
|
+
end
|
172
|
+
end
|
173
|
+
end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'hpricot'
|
3
|
+
|
4
|
+
module Repub
|
5
|
+
class App
|
6
|
+
module Parser
|
7
|
+
|
8
|
+
class ParserException < RuntimeError; end
|
9
|
+
|
10
|
+
# Mixin entry point: builds a Parser from the includer's options and
# parses the given cache, returning whatever Parser#parse returns.
def parse(cache)
  document_parser = Parser.new(options)
  document_parser.parse(cache)
end
|
13
|
+
|
14
|
+
# Default hpricot selectors, keyed by the element they locate.
# Used by Parser when the options carry no :selectors entry.
#
Selectors = {
  :title       => "//h1",
  :toc         => "//div.toc/ul",
  :toc_item    => "/li",
  :toc_section => "/ul"
}
|
22
|
+
|
23
|
+
class Parser
|
24
|
+
include Logger
|
25
|
+
|
26
|
+
attr_reader :cache
|
27
|
+
attr_reader :uid
|
28
|
+
attr_reader :title
|
29
|
+
attr_reader :title_html
|
30
|
+
attr_reader :toc
|
31
|
+
|
32
|
+
# Reads the parser settings out of the options hash: the selector table
# (falling back to the default Selectors) and the fixup flag.
def initialize(options)
  custom = options[:selectors]
  @selectors = custom ? custom : Selectors
  @fixup = options[:fixup]
end
|
36
|
+
|
37
|
+
# Parses the single HTML document held in the given cache and extracts
# its uid, title (text and HTML) and table of contents.
#
# cache - object exposing assets[:documents], path and name.
#
# Returns self. Raises ParserException unless exactly one HTML document
# is present.
def parse(cache)
  documents = cache.assets[:documents]
  raise ParserException, "No HTML document found" if documents.empty?
  if documents.size > 1
    raise ParserException, "More than one HTML document found, this is not supported (yet)"
  end

  @cache = cache
  @asset = documents[0]
  log.debug "-- Parsing #{@asset}"
  # Block form so the file handle is closed after parsing instead of
  # being left open for the lifetime of the process.
  # NOTE(review): assumes Hpricot reads the whole stream during this call
  # — confirm. The second argument is passed through unchanged.
  @doc = File.open(File.join(@cache.path, @asset)) { |io| Hpricot(io, @fixup) }

  @uid = @cache.name
  parse_title
  parse_title_html
  parse_toc

  self
end
|
55
|
+
|
56
|
+
private
|
57
|
+
|
58
|
+
UNTITLED = 'Untitled'
|
59
|
+
|
60
|
+
# Sets @title from the element matched by the :title selector, with line
# breaks removed and whitespace runs collapsed; falls back to UNTITLED
# (with a warning) when nothing matches.
def parse_title
  selector = @selectors[:title]
  log.debug "-- Looking for title with #{selector}"
  heading = @doc.at(selector)

  unless heading
    @title = UNTITLED
    return log.warn("** Could not parse document title, using '#{@title}'")
  end

  kids = heading.children
  raw = if kids.empty?
    heading.inner_text
  else
    # join child texts with a space so adjacent inline elements don't fuse
    kids.map { |kid| kid.inner_text }.join(' ')
  end
  @title = raw.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
  log.info "Found title \"#{@title}\""
end
|
76
|
+
|
77
|
+
# Sets @title_html to the inner HTML of the :title element with line
# breaks removed, or to UNTITLED when the element is missing.
def parse_title_html
  selector = @selectors[:title]
  log.debug "-- Looking for html title with #{selector}"
  heading = @doc.at(selector)
  if heading
    @title_html = heading.inner_html.gsub(/[\r\n]/, '')
  else
    @title_html = UNTITLED
  end
end
|
82
|
+
|
83
|
+
# One table-of-contents entry: a title, the target document URI, an
# optional fragment id, and nested sub-entries.
class TocItem < Struct.new(
  :title,
  :uri,
  :fragment_id
)

  # title                - entry text.
  # uri_with_fragment_id - link target such as "page.html#sec"; may be
  #                        just "#sec", or empty.
  # subitems             - array of nested entries, or nil for none.
  # asset                - document name used when the link has no URI
  #                        part (a same-document link).
  def initialize(title, uri_with_fragment_id, subitems, asset)
    self.title = title
    # Split at the first '#' only, so the fragment is kept whole.
    target, fragment = uri_with_fragment_id.to_s.split(/#/, 2)
    # "" and "#" leave no URI part at all; fall back to the asset instead
    # of calling empty? on nil.
    self.uri = (target.nil? || target.empty?) ? asset : target
    # An empty fragment counts as no fragment.
    self.fragment_id = (fragment.nil? || fragment.empty?) ? nil : fragment
    @subitems = subitems || []
  end

  attr_reader :subitems

  # Link target in "uri#fragment_id" form (the '#' is always present).
  def src
    "#{uri}##{fragment_id}"
  end
end
|
102
|
+
|
103
|
+
# Sets @toc from the element matched by the :toc selector; when nothing
# matches, @toc becomes an empty list and a warning is logged.
def parse_toc
  selector = @selectors[:toc]
  log.debug "-- Looking for TOC with #{selector}"
  root = @doc.at(selector)

  unless root
    @toc = []
    return log.warn("** Could not parse document table of contents")
  end

  @toc = parse_toc_section(root)
  log.info "Found TOC with #{@toc.size} top-level items"
end
|
114
|
+
|
115
|
+
# Recursively converts one TOC section element into a list of TocItem.
#
# section - element searched with the :toc_item selector. Each match
#           needs a link (itself an 'a', or containing one) with an href;
#           matches without one are skipped.
#
# Returns an array of TocItem. The sub-entries of an item are collected
# from every nested section matched by :toc_section, in document order.
def parse_toc_section(section)
  toc = []
  log.debug "-- Looking for TOC items with #{@selectors[:toc_item]}"
  section.search(@selectors[:toc_item]).each do |item|
    a = item.name == 'a' ? item : item.at('a')
    next if a.nil?
    href = a['href']
    next if href.nil?
    title = item.inner_text.gsub(/\s+/, ' ').strip
    log.debug "-- Found item: #{title}"
    # Accumulate rather than overwrite: an item holding several nested
    # sections used to keep only the entries of the last one.
    subitems = []
    item.search(@selectors[:toc_section]).each do |subsection|
      log.debug "-- Found section with #{@selectors[:toc_section]} >>>"
      subitems.concat(parse_toc_section(subsection))
      log.debug '-- <<<'
    end
    toc << TocItem.new(title, href, subitems, @asset)
  end
  toc
end
|
135
|
+
end
|
136
|
+
|
137
|
+
end
|
138
|
+
end
|
139
|
+
end
|