repub 0.3.2 → 0.3.3
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +25 -9
- data/README.rdoc +46 -40
- data/Rakefile +1 -0
- data/bin/repub +1 -1
- data/lib/repub.rb +1 -1
- data/lib/repub/app.rb +3 -3
- data/lib/repub/app/builder.rb +84 -36
- data/lib/repub/app/fetcher.rb +13 -11
- data/lib/repub/app/options.rb +36 -5
- data/lib/repub/app/parser.rb +1 -1
- data/lib/repub/app/profile.rb +16 -15
- data/lib/repub/epub/container.rb +28 -28
- data/lib/repub/epub/content.rb +59 -34
- data/lib/repub/epub/toc.rb +139 -139
- data/repub.gemspec +3 -3
- data/test/data/custom.css +3 -0
- data/test/data/invisiblellama.png +0 -0
- data/test/data/test.css +5 -0
- data/test/data/test.html +60 -0
- data/test/epub/test_container.rb +4 -4
- data/test/epub/test_content.rb +42 -38
- data/test/epub/test_toc.rb +19 -7
- data/test/test_builder.rb +145 -1
- data/test/test_fetcher.rb +79 -20
- data/test/test_parser.rb +45 -32
- metadata +6 -2
data/History.txt
CHANGED
@@ -1,18 +1,34 @@
|
|
1
|
-
== 0.
|
2
|
-
|
3
|
-
* Initial release
|
1
|
+
== 0.3.3 / 2009-07-05
|
4
2
|
|
5
|
-
|
3
|
+
* New features
|
6
4
|
|
7
|
-
*
|
8
|
-
*
|
9
|
-
*
|
5
|
+
* Option to add external files to the generated ePub (e.g. cover images, logos etc)
|
6
|
+
* Option to insert HTML fragments before/after specific element
|
7
|
+
* It is now possible to instruct repub to remove all links to CSS and <style> elements from source doc
|
10
8
|
|
11
|
-
|
9
|
+
* Bug fixes
|
12
10
|
|
13
|
-
*
|
11
|
+
* Metadata double namespace prefix
|
12
|
+
* Encoding autodetection is now done only once after download (as it was supposed to be)
|
13
|
+
* -e flag actually works
|
14
|
+
* Source doc content-type encoding is now always set to UTF-8
|
15
|
+
* Fixed warnings in Profile helper under Ruby 1.9.1
|
14
16
|
|
15
17
|
== 0.3.2 / 2009-06-30
|
16
18
|
|
17
19
|
* Improved Win32 support
|
18
20
|
* Updated documentation
|
21
|
+
|
22
|
+
== 0.3.1 / 2009-06-28
|
23
|
+
|
24
|
+
* Fixed App.data_path bug
|
25
|
+
|
26
|
+
== 0.3.0 / 2009-06-28
|
27
|
+
|
28
|
+
* Switched to Nokogiri for HTML parsing
|
29
|
+
* Better parsing for hierarchical TOCs
|
30
|
+
* Many bug fixes
|
31
|
+
|
32
|
+
== 0.2.1 / 2009-06-26
|
33
|
+
|
34
|
+
* Initial release
|
data/README.rdoc
CHANGED
@@ -67,7 +67,7 @@ For example, if you later decide to regenerate Git Manual ePub without TOC at th
|
|
67
67
|
|
68
68
|
repub -l git-manual -X '//div[@class="toc"]' http://www.kernel.org/pub/software/scm/git/docs/user-manual.html
|
69
69
|
|
70
|
-
|
70
|
+
A few more examples:
|
71
71
|
|
72
72
|
* GNU Wget Manual
|
73
73
|
|
@@ -81,47 +81,49 @@ A few more examples:
|
|
81
81
|
repub -x 'title:body/h1' -x 'toc://table' -x 'toc_item://tr' -X '//pre' -X '//hr' -X '//body/h4' \
|
82
82
|
http://www.gutenberg.org/files/11/11-h/11-h.htm
|
83
83
|
|
84
|
-
* The Gelug-Kagyu Tradition of Mahamudra from Berzin Archives
|
85
|
-
|
86
|
-
repub http://www.berzinarchives.com/web/x/prn/p.html_680632258.html
|
87
|
-
|
88
84
|
== SYNOPSIS:
|
89
85
|
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
86
|
+
Repub is a simple HTML to ePub converter.
|
87
|
+
|
88
|
+
Usage: repub [options] url
|
89
|
+
|
90
|
+
General options:
|
91
|
+
-D, --downloader NAME Which downloader to use to get files (wget or httrack).
|
92
|
+
Default is wget.
|
93
|
+
-o, --output PATH Output path for generated ePub file.
|
94
|
+
Default is /Users/dg/Projects/repub/<Parsed_Title>.epub
|
95
|
+
-w, --write-profile NAME Save given options for later reuse as profile NAME.
|
96
|
+
-l, --load-profile NAME Load options from saved profile NAME.
|
97
|
+
-W, --write-default Save given options for later reuse as default profile.
|
98
|
+
-L, --list-profiles List saved profiles.
|
99
|
+
-C, --cleanup Clean up download cache.
|
100
|
+
-v, --verbose Turn on verbose output.
|
101
|
+
-q, --quiet Turn off any output except errors.
|
102
|
+
-V, --version Show version.
|
103
|
+
-h, --help Show this help message.
|
104
|
+
|
105
|
+
Parser options:
|
106
|
+
-x, --selector NAME:VALUE Set parser XPath selector NAME to VALUE.
|
107
|
+
Recognized selectors are: [title toc toc_item toc_section]
|
108
|
+
-m, --meta NAME:VALUE Set publication information metadata NAME to VALUE.
|
109
|
+
Valid metadata names are: [creator date description
|
110
|
+
language publisher relation rights subject title]
|
111
|
+
-F, --no-fixup Do not attempt to make document meet XHTML 1.0 Strict.
|
112
|
+
Default is to try and fix things that are broken.
|
113
|
+
-e, --encoding NAME Set source document encoding. Default is to autodetect.
|
114
|
+
|
115
|
+
Post-processing options:
|
116
|
+
-s, --stylesheet PATH Use custom stylesheet at PATH. Use -s- to remove
|
117
|
+
all links to stylesheets and <style> blocks from the source.
|
118
|
+
-a, --add PATH Add external file to the generated ePub.
|
119
|
+
-N, --new-fragment XHTML Prepare document fragment for -A and -P operations.
|
120
|
+
-A, --after SELECTOR Insert fragment after element with XPath selector.
|
121
|
+
-P, --before SELECTOR Insert fragment before element with XPath selector.
|
122
|
+
-X, --remove SELECTOR Remove source element using XPath selector.
|
123
|
+
Use -X- to ignore stored profile.
|
124
|
+
-R, --rx /PATTERN/REPLACEMENT/ Edit source HTML using regular expressions.
|
125
|
+
Use -R- to ignore stored profile.
|
126
|
+
-B, --browser After processing, open resulting HTML in default browser.
|
125
127
|
|
126
128
|
== DEPENDENCIES:
|
127
129
|
|
@@ -140,6 +142,10 @@ Also, the following tools must be somewhere in $PATH:
|
|
140
142
|
Currently, only "everything-on-one-page" HTML sources are supported. Repub will download and process all page requisites
|
141
143
|
(stylesheets and images) but all actual content must be on one page.
|
142
144
|
|
145
|
+
Encoding auto-detection is slow.
|
146
|
+
|
147
|
+
Chardet 0.9.0 is broken under Ruby 1.9.
|
148
|
+
|
143
149
|
Bugs: probably. If you find any, please report them to dg at invisiblellama dot net.
|
144
150
|
|
145
151
|
== INSTALL:
|
data/Rakefile
CHANGED
data/bin/repub
CHANGED
data/lib/repub.rb
CHANGED
data/lib/repub/app.rb
CHANGED
@@ -31,10 +31,10 @@ module Repub
|
|
31
31
|
|
32
32
|
log.level = options[:verbosity]
|
33
33
|
log.info "Making ePub from #{options[:url]}"
|
34
|
-
|
35
|
-
log.info "Saved #{
|
34
|
+
builder = build(parse(fetch))
|
35
|
+
log.info "Saved #{builder.output_path}"
|
36
36
|
|
37
|
-
Launchy::Browser.run(
|
37
|
+
Launchy::Browser.run(builder.document_path) if options[:browser]
|
38
38
|
|
39
39
|
rescue RuntimeError => ex
|
40
40
|
log.fatal "** ERROR: #{ex.to_s}"
|
data/lib/repub/app/builder.rb
CHANGED
@@ -16,7 +16,7 @@ module Repub
|
|
16
16
|
include Epub, Logger
|
17
17
|
|
18
18
|
attr_reader :output_path
|
19
|
-
attr_reader :
|
19
|
+
attr_reader :document_path
|
20
20
|
|
21
21
|
def initialize(options)
|
22
22
|
@options = options
|
@@ -78,59 +78,69 @@ module Repub
|
|
78
78
|
|
79
79
|
def copy_and_process_assets
|
80
80
|
# Copy html
|
81
|
-
@parser.cache.assets[:documents].each do |
|
82
|
-
log.debug "-- Processing document #{
|
81
|
+
@parser.cache.assets[:documents].each do |doc|
|
82
|
+
log.debug "-- Processing document #{doc}"
|
83
83
|
# Copy asset from cache
|
84
|
-
FileUtils.cp(File.join(@parser.cache.path,
|
84
|
+
FileUtils.cp(File.join(@parser.cache.path, doc), '.')
|
85
85
|
# Do post-processing
|
86
|
-
postprocess_file(
|
87
|
-
postprocess_doc(
|
88
|
-
@content.
|
89
|
-
@
|
86
|
+
postprocess_file(doc)
|
87
|
+
postprocess_doc(doc)
|
88
|
+
@content.add_item(doc)
|
89
|
+
@document_path = File.expand_path(doc)
|
90
90
|
end
|
91
|
+
|
91
92
|
# Copy css
|
92
93
|
if @options[:css].nil? || @options[:css].empty?
|
93
94
|
# No custom css, copy one from assets
|
94
95
|
@parser.cache.assets[:stylesheets].each do |css|
|
95
96
|
log.debug "-- Copying stylesheet #{css}"
|
96
97
|
FileUtils.cp(File.join(@parser.cache.path, css), '.')
|
97
|
-
@content.
|
98
|
+
@content.add_item(css)
|
98
99
|
end
|
99
|
-
|
100
|
+
elsif @options[:css] != '-'
|
100
101
|
# Copy custom css
|
101
102
|
log.debug "-- Using custom stylesheet #{@options[:css]}"
|
102
103
|
FileUtils.cp(@options[:css], '.')
|
103
|
-
@content.
|
104
|
+
@content.add_item(File.basename(@options[:css]))
|
104
105
|
end
|
106
|
+
|
105
107
|
# Copy images
|
106
108
|
@parser.cache.assets[:images].each do |image|
|
107
109
|
log.debug "-- Copying image #{image}"
|
108
110
|
FileUtils.cp(File.join(@parser.cache.path, image), '.')
|
109
|
-
@content.
|
111
|
+
@content.add_item(image)
|
110
112
|
end
|
113
|
+
|
114
|
+
# Copy external custom files (-a option)
|
115
|
+
@options[:add].each do |file|
|
116
|
+
log.debug "-- Copying external file #{file}"
|
117
|
+
FileUtils.cp(file, '.')
|
118
|
+
@content.add_item(file)
|
119
|
+
end if @options[:add]
|
111
120
|
end
|
112
121
|
|
113
122
|
def postprocess_file(asset)
|
114
123
|
source = IO.read(asset)
|
124
|
+
|
115
125
|
# Do rx substitutions
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
end
|
126
|
+
@options[:rx].each do |rx|
|
127
|
+
rx.strip!
|
128
|
+
delimiter = rx[0, 1]
|
129
|
+
rx = rx.gsub(/\\#{delimiter}/, "\n")
|
130
|
+
ra = rx.split(/#{delimiter}/).reject {|e| e.empty? }.each {|e| e.gsub!(/\n/, "#{delimiter}")}
|
131
|
+
raise ParserException, "Invalid regular expression" if ra.empty? || ra[0].nil? || ra.size > 2
|
132
|
+
pattern = ra[0]
|
133
|
+
replacement = ra[1] || ''
|
134
|
+
log.info "Replacing pattern /#{pattern.gsub(/#{delimiter}/, "\\#{delimiter}")}/ with \"#{replacement}\""
|
135
|
+
source.gsub!(Regexp.new(pattern), replacement)
|
136
|
+
end if @options[:rx]
|
137
|
+
|
129
138
|
# Add doctype if missing
|
130
139
|
if source !~ /\s*<!DOCTYPE/
|
131
140
|
log.debug "-- Adding missing doctype"
|
132
141
|
source = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + source
|
133
142
|
end
|
143
|
+
|
134
144
|
# Save processed file
|
135
145
|
File.open(asset, 'w') do |f|
|
136
146
|
f.write(source)
|
@@ -139,23 +149,61 @@ module Repub
|
|
139
149
|
|
140
150
|
def postprocess_doc(asset)
|
141
151
|
doc = Nokogiri::HTML.parse(IO.read(asset), nil, 'UTF-8')
|
142
|
-
|
143
|
-
|
144
|
-
|
152
|
+
|
153
|
+
# Set Content-Type charset to UTF-8
|
154
|
+
doc.xpath('//head/meta[@http-equiv="Content-Type"]').each do |el|
|
155
|
+
el['content'] = 'text/html; charset=utf-8'
|
156
|
+
end
|
157
|
+
|
158
|
+
# Process styles
|
159
|
+
if @options[:css] && !@options[:css].empty?
|
160
|
+
# Remove all stylesheet links
|
161
|
+
doc.xpath('//head/link[@rel="stylesheet"]').remove
|
162
|
+
if @options[:css] == '-'
|
163
|
+
# Also remove all inline styles
|
164
|
+
doc.xpath('//head/style').remove
|
165
|
+
log.info "Removing all stylesheet links and style elements"
|
166
|
+
else
|
167
|
+
# Add custom stylesheet link
|
168
|
+
link = Nokogiri::XML::Node.new('link', doc)
|
169
|
+
link['rel'] = 'stylesheet'
|
170
|
+
link['type'] = 'text/css'
|
145
171
|
link['href'] = File.basename(@options[:css])
|
146
|
-
|
172
|
+
# Add as the last child so it has precedence over (possible) inline styles before
|
173
|
+
doc.at('//head').add_child(link)
|
174
|
+
log.info "Replacing CSS refs with \"#{link['href']}\""
|
147
175
|
end
|
148
176
|
end
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
177
|
+
|
178
|
+
# Insert elements after/before selector
|
179
|
+
@options[:after].each do |e|
|
180
|
+
selector = e.keys.first
|
181
|
+
fragment = e[selector]
|
182
|
+
element = doc.xpath(selector).first
|
183
|
+
if element
|
184
|
+
log.info "Inserting fragment \"#{fragment.to_html}\" after \"#{selector}\""
|
185
|
+
fragment.children.to_a.reverse.each {|node| element.add_next_sibling(node) }
|
154
186
|
end
|
155
|
-
end
|
187
|
+
end if @options[:after]
|
188
|
+
@options[:before].each do |e|
|
189
|
+
selector = e.keys.first
|
190
|
+
fragment = e[selector]
|
191
|
+
element = doc.xpath(selector).first
|
192
|
+
if element
|
193
|
+
log.info "Inserting fragment \"#{fragment}\" before \"#{selector}\""
|
194
|
+
fragment.children.to_a.each {|node| element.add_previous_sibling(node) }
|
195
|
+
end
|
196
|
+
end if @options[:before]
|
197
|
+
|
198
|
+
# Remove elements
|
199
|
+
@options[:remove].each do |selector|
|
200
|
+
log.info "Removing elements \"#{selector}\""
|
201
|
+
doc.search(selector).remove
|
202
|
+
end if @options[:remove]
|
203
|
+
|
156
204
|
# Save processed doc
|
157
205
|
File.open(asset, 'w') do |f|
|
158
|
-
if @options[:fixup]
|
206
|
+
if @options[:fixup] || true
|
159
207
|
# HACK: Nokogiri seems to ignore the fact that xmlns and other attrs are already present
|
160
208
|
# in html node and adds them anyway. Just remove them here to avoid duplicates.
|
161
209
|
doc.root.attributes.each {|name, value| doc.root.remove_attribute(name) }
|
data/lib/repub/app/fetcher.rb
CHANGED
@@ -4,6 +4,7 @@ require 'uri'
|
|
4
4
|
require 'iconv'
|
5
5
|
require 'rubygems'
|
6
6
|
|
7
|
+
# Temporarily disable warnings from chardet
|
7
8
|
old_verbose = $VERBOSE
|
8
9
|
$VERBOSE = false
|
9
10
|
require 'UniversalDetector'
|
@@ -24,26 +25,27 @@ module Repub
|
|
24
25
|
:stylesheets => %w[css],
|
25
26
|
:images => %w[jpg jpeg png gif svg]
|
26
27
|
}
|
27
|
-
|
28
|
+
|
28
29
|
class Fetcher
|
29
30
|
include Logger
|
30
31
|
|
31
32
|
Downloaders = {
|
32
33
|
:wget => { :cmd => 'wget', :options => '-nv -E -H -k -p -nH -nd' },
|
33
|
-
:httrack => { :cmd => 'httrack', :options => '-
|
34
|
+
:httrack => { :cmd => 'httrack', :options => '-gBqQ -r2 +*.css +*.jpg -*.xml -*.html' }
|
34
35
|
}
|
35
36
|
|
36
37
|
def initialize(options)
|
37
38
|
@options = options
|
38
39
|
@downloader_path, @downloader_options = ENV['REPUB_DOWNLOADER'], ENV['REPUB_DOWNLOADER_OPTIONS']
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
40
|
+
downloader =
|
41
|
+
begin
|
42
|
+
Downloaders[@options[:helper].to_sym] || Downloaders[:wget]
|
43
|
+
rescue
|
44
|
+
Downloaders[:wget]
|
45
|
+
end
|
46
|
+
log.debug "-- Using #{downloader[:cmd]} #{downloader[:options]}"
|
47
|
+
@downloader_path ||= which(downloader[:cmd])
|
48
|
+
@downloader_options ||= downloader[:options]
|
47
49
|
end
|
48
50
|
|
49
51
|
def fetch
|
@@ -82,7 +84,7 @@ module Repub
|
|
82
84
|
encoding = UniversalDetector.chardet(s)['encoding']
|
83
85
|
end
|
84
86
|
if encoding.downcase != 'utf-8'
|
85
|
-
log.info "Source encoding
|
87
|
+
log.info "Source encoding appears to be #{encoding}, converting to UTF-8"
|
86
88
|
s = Iconv.conv('utf-8', encoding, IO.read(doc))
|
87
89
|
File.open(doc, 'w') { |f| f.write(s) }
|
88
90
|
end
|
data/lib/repub/app/options.rb
CHANGED
@@ -11,6 +11,9 @@ module Repub
|
|
11
11
|
|
12
12
|
# Default options
|
13
13
|
@options = {
|
14
|
+
:add => [],
|
15
|
+
:after => [],
|
16
|
+
:before => [],
|
14
17
|
:browser => false,
|
15
18
|
:css => nil,
|
16
19
|
:encoding => nil,
|
@@ -129,10 +132,38 @@ module Repub
|
|
129
132
|
opts.separator " Post-processing options:"
|
130
133
|
|
131
134
|
opts.on("-s", "--stylesheet PATH", String,
|
132
|
-
"Use custom stylesheet at PATH
|
133
|
-
"
|
134
|
-
) { |value| options[:css] = File.expand_path(value) }
|
135
|
+
"Use custom stylesheet at PATH. Use -s- to remove",
|
136
|
+
"all links to stylesheets and <style> blocks from the source."
|
137
|
+
) { |value| options[:css] = value == '-' ? value : File.expand_path(value) }
|
135
138
|
|
139
|
+
opts.on("-a", "--add PATH", String,
|
140
|
+
"Add external file to the generated ePub."
|
141
|
+
) { |value| options[:add] << File.expand_path(value) }
|
142
|
+
|
143
|
+
opts.on("-N", "--new-fragment XHTML", String,
|
144
|
+
"Prepare document fragment for -A and -P operations."
|
145
|
+
) do |value|
|
146
|
+
begin
|
147
|
+
@fragment = Nokogiri::HTML.fragment(value)
|
148
|
+
rescue Exception => ex
|
149
|
+
log.fatal "ERROR: invalid fragment: #{ex.to_s}"
|
150
|
+
end
|
151
|
+
end
|
152
|
+
|
153
|
+
opts.on("-A", "--after SELECTOR", String,
|
154
|
+
"Insert fragment after element with XPath selector."
|
155
|
+
) do |value|
|
156
|
+
log.fatal "ERROR: -A requires a fragment. See '#{App.name} --help'." if !@fragment
|
157
|
+
@options[:after] << {value => @fragment.clone}
|
158
|
+
end
|
159
|
+
|
160
|
+
opts.on("-P", "--before SELECTOR", String,
|
161
|
+
"Insert fragment before element with XPath selector."
|
162
|
+
) do |value|
|
163
|
+
log.fatal "ERROR: -P requires a fragment. See '#{App.name} --help'." if !@fragment
|
164
|
+
@options[:before] << {value => @fragment.clone}
|
165
|
+
end
|
166
|
+
|
136
167
|
opts.on("-X", "--remove SELECTOR", String,
|
137
168
|
"Remove source element using XPath selector.",
|
138
169
|
"Use -X- to ignore stored profile."
|
@@ -143,7 +174,7 @@ module Repub
|
|
143
174
|
"Use -R- to ignore stored profile."
|
144
175
|
) { |value| value == '-' ? options[:rx] = [] : options[:rx] << value }
|
145
176
|
|
146
|
-
opts.on("-B", "--
|
177
|
+
opts.on("-B", "--browser",
|
147
178
|
"After processing, open resulting HTML in default browser."
|
148
179
|
) { |value| options[:browser] = true }
|
149
180
|
|
@@ -177,4 +208,4 @@ module Repub
|
|
177
208
|
|
178
209
|
end
|
179
210
|
end
|
180
|
-
end
|
211
|
+
end
|