repub 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +25 -9
- data/README.rdoc +46 -40
- data/Rakefile +1 -0
- data/bin/repub +1 -1
- data/lib/repub.rb +1 -1
- data/lib/repub/app.rb +3 -3
- data/lib/repub/app/builder.rb +84 -36
- data/lib/repub/app/fetcher.rb +13 -11
- data/lib/repub/app/options.rb +36 -5
- data/lib/repub/app/parser.rb +1 -1
- data/lib/repub/app/profile.rb +16 -15
- data/lib/repub/epub/container.rb +28 -28
- data/lib/repub/epub/content.rb +59 -34
- data/lib/repub/epub/toc.rb +139 -139
- data/repub.gemspec +3 -3
- data/test/data/custom.css +3 -0
- data/test/data/invisiblellama.png +0 -0
- data/test/data/test.css +5 -0
- data/test/data/test.html +60 -0
- data/test/epub/test_container.rb +4 -4
- data/test/epub/test_content.rb +42 -38
- data/test/epub/test_toc.rb +19 -7
- data/test/test_builder.rb +145 -1
- data/test/test_fetcher.rb +79 -20
- data/test/test_parser.rb +45 -32
- metadata +6 -2
data/History.txt
CHANGED
@@ -1,18 +1,34 @@
|
|
1
|
-
== 0.
|
2
|
-
|
3
|
-
* Initial release
|
1
|
+
== 0.3.3 / 2009-07-05
|
4
2
|
|
5
|
-
|
3
|
+
* New features
|
6
4
|
|
7
|
-
*
|
8
|
-
*
|
9
|
-
*
|
5
|
+
* Option to add external files to the generated ePub (e.g. cover images, logos etc)
|
6
|
+
* Option to insert HTML fragments before/after a specific element
|
7
|
+
* It is now possible to instruct repub to remove all links to CSS and <style> elements from the source doc
|
10
8
|
|
11
|
-
|
9
|
+
* Bug fixes
|
12
10
|
|
13
|
-
*
|
11
|
+
* Metadata double namespace prefix
|
12
|
+
* Encoding autodetection is now done only once after download (as it was supposed to be)
|
13
|
+
* -e flag actually works
|
14
|
+
* Source doc content-type encoding is now always set to utf-8
|
15
|
+
* Fixed warnings in Profile helper under Ruby 1.9.1
|
14
16
|
|
15
17
|
== 0.3.2 / 2009-06-30
|
16
18
|
|
17
19
|
* Improved Win32 support
|
18
20
|
* Updated documentation
|
21
|
+
|
22
|
+
== 0.3.1 / 2009-06-28
|
23
|
+
|
24
|
+
* Fixed App.data_path bug
|
25
|
+
|
26
|
+
== 0.3.0 / 2009-06-28
|
27
|
+
|
28
|
+
* Switched to Nokogiri for HTML parsing
|
29
|
+
* Better parsing for hierarchical TOCs
|
30
|
+
* Many bug fixes
|
31
|
+
|
32
|
+
== 0.2.1 / 2009-06-26
|
33
|
+
|
34
|
+
* Initial release
|
data/README.rdoc
CHANGED
@@ -67,7 +67,7 @@ For example, if you later decide to regenerate Git Manual ePub without TOC at th
|
|
67
67
|
|
68
68
|
repub -l git-manual -X '//div[@class="toc"]' http://www.kernel.org/pub/software/scm/git/docs/user-manual.html
|
69
69
|
|
70
|
-
|
70
|
+
A few more examples:
|
71
71
|
|
72
72
|
* GNU Wget Manual
|
73
73
|
|
@@ -81,47 +81,49 @@ A few more examples:
|
|
81
81
|
repub -x 'title:body/h1' -x 'toc://table' -x 'toc_item://tr' -X '//pre' -X '//hr' -X '//body/h4' \
|
82
82
|
http://www.gutenberg.org/files/11/11-h/11-h.htm
|
83
83
|
|
84
|
-
* The Gelug-Kagyu Tradition of Mahamudra from Berzin Archives
|
85
|
-
|
86
|
-
repub http://www.berzinarchives.com/web/x/prn/p.html_680632258.html
|
87
|
-
|
88
84
|
== SYNOPSIS:
|
89
85
|
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
86
|
+
Repub is a simple HTML to ePub converter.
|
87
|
+
|
88
|
+
Usage: repub [options] url
|
89
|
+
|
90
|
+
General options:
|
91
|
+
-D, --downloader NAME Which downloader to use to get files (wget or httrack).
|
92
|
+
Default is wget.
|
93
|
+
-o, --output PATH Output path for generated ePub file.
|
94
|
+
Default is /Users/dg/Projects/repub/<Parsed_Title>.epub
|
95
|
+
-w, --write-profile NAME Save given options for later reuse as profile NAME.
|
96
|
+
-l, --load-profile NAME Load options from saved profile NAME.
|
97
|
+
-W, --write-default Save given options for later reuse as default profile.
|
98
|
+
-L, --list-profiles List saved profiles.
|
99
|
+
-C, --cleanup Clean up download cache.
|
100
|
+
-v, --verbose Turn on verbose output.
|
101
|
+
-q, --quiet Turn off any output except errors.
|
102
|
+
-V, --version Show version.
|
103
|
+
-h, --help Show this help message.
|
104
|
+
|
105
|
+
Parser options:
|
106
|
+
-x, --selector NAME:VALUE Set parser XPath selector NAME to VALUE.
|
107
|
+
Recognized selectors are: [title toc toc_item toc_section]
|
108
|
+
-m, --meta NAME:VALUE Set publication information metadata NAME to VALUE.
|
109
|
+
Valid metadata names are: [creator date description
|
110
|
+
language publisher relation rights subject title]
|
111
|
+
-F, --no-fixup Do not attempt to make document meet XHTML 1.0 Strict.
|
112
|
+
Default is to try and fix things that are broken.
|
113
|
+
-e, --encoding NAME Set source document encoding. Default is to autodetect.
|
114
|
+
|
115
|
+
Post-processing options:
|
116
|
+
-s, --stylesheet PATH Use custom stylesheet at PATH. Use -s- to remove
|
117
|
+
all links to stylesheets and <style> blocks from the source.
|
118
|
+
-a, --add PATH Add external file to the generated ePub.
|
119
|
+
-N, --new-fragment XHTML Prepare document fragment for -A and -P operations.
|
120
|
+
-A, --after SELECTOR Insert fragment after element with XPath selector.
|
121
|
+
-P, --before SELECTOR Insert fragment before element with XPath selector.
|
122
|
+
-X, --remove SELECTOR Remove source element using XPath selector.
|
123
|
+
Use -X- to ignore stored profile.
|
124
|
+
-R, --rx /PATTERN/REPLACEMENT/ Edit source HTML using regular expressions.
|
125
|
+
Use -R- to ignore stored profile.
|
126
|
+
-B, --browser After processing, open resulting HTML in default browser.
|
125
127
|
|
126
128
|
== DEPENDENCIES:
|
127
129
|
|
@@ -140,6 +142,10 @@ Also, the following tools must be somewhere in $PATH:
|
|
140
142
|
Currently, only "everything-on-one-page" HTML sources are supported. Repub will download and process all page requisites
|
141
143
|
(stylesheets and images) but all actual content must be on one page.
|
142
144
|
|
145
|
+
Encoding auto-detection is slow.
|
146
|
+
|
147
|
+
Chardet 0.9.0 is broken under Ruby 1.9.
|
148
|
+
|
143
149
|
Bugs: probably. If you find any, please report them to dg at invisiblellama dot net.
|
144
150
|
|
145
151
|
== INSTALL:
|
data/Rakefile
CHANGED
data/bin/repub
CHANGED
data/lib/repub.rb
CHANGED
data/lib/repub/app.rb
CHANGED
@@ -31,10 +31,10 @@ module Repub
|
|
31
31
|
|
32
32
|
log.level = options[:verbosity]
|
33
33
|
log.info "Making ePub from #{options[:url]}"
|
34
|
-
|
35
|
-
log.info "Saved #{
|
34
|
+
builder = build(parse(fetch))
|
35
|
+
log.info "Saved #{builder.output_path}"
|
36
36
|
|
37
|
-
Launchy::Browser.run(
|
37
|
+
Launchy::Browser.run(builder.document_path) if options[:browser]
|
38
38
|
|
39
39
|
rescue RuntimeError => ex
|
40
40
|
log.fatal "** ERROR: #{ex.to_s}"
|
data/lib/repub/app/builder.rb
CHANGED
@@ -16,7 +16,7 @@ module Repub
|
|
16
16
|
include Epub, Logger
|
17
17
|
|
18
18
|
attr_reader :output_path
|
19
|
-
attr_reader :
|
19
|
+
attr_reader :document_path
|
20
20
|
|
21
21
|
def initialize(options)
|
22
22
|
@options = options
|
@@ -78,59 +78,69 @@ module Repub
|
|
78
78
|
|
79
79
|
def copy_and_process_assets
|
80
80
|
# Copy html
|
81
|
-
@parser.cache.assets[:documents].each do |
|
82
|
-
log.debug "-- Processing document #{
|
81
|
+
@parser.cache.assets[:documents].each do |doc|
|
82
|
+
log.debug "-- Processing document #{doc}"
|
83
83
|
# Copy asset from cache
|
84
|
-
FileUtils.cp(File.join(@parser.cache.path,
|
84
|
+
FileUtils.cp(File.join(@parser.cache.path, doc), '.')
|
85
85
|
# Do post-processing
|
86
|
-
postprocess_file(
|
87
|
-
postprocess_doc(
|
88
|
-
@content.
|
89
|
-
@
|
86
|
+
postprocess_file(doc)
|
87
|
+
postprocess_doc(doc)
|
88
|
+
@content.add_item(doc)
|
89
|
+
@document_path = File.expand_path(doc)
|
90
90
|
end
|
91
|
+
|
91
92
|
# Copy css
|
92
93
|
if @options[:css].nil? || @options[:css].empty?
|
93
94
|
# No custom css, copy one from assets
|
94
95
|
@parser.cache.assets[:stylesheets].each do |css|
|
95
96
|
log.debug "-- Copying stylesheet #{css}"
|
96
97
|
FileUtils.cp(File.join(@parser.cache.path, css), '.')
|
97
|
-
@content.
|
98
|
+
@content.add_item(css)
|
98
99
|
end
|
99
|
-
|
100
|
+
elsif @options[:css] != '-'
|
100
101
|
# Copy custom css
|
101
102
|
log.debug "-- Using custom stylesheet #{@options[:css]}"
|
102
103
|
FileUtils.cp(@options[:css], '.')
|
103
|
-
@content.
|
104
|
+
@content.add_item(File.basename(@options[:css]))
|
104
105
|
end
|
106
|
+
|
105
107
|
# Copy images
|
106
108
|
@parser.cache.assets[:images].each do |image|
|
107
109
|
log.debug "-- Copying image #{image}"
|
108
110
|
FileUtils.cp(File.join(@parser.cache.path, image), '.')
|
109
|
-
@content.
|
111
|
+
@content.add_item(image)
|
110
112
|
end
|
113
|
+
|
114
|
+
# Copy external custom files (-a option)
|
115
|
+
@options[:add].each do |file|
|
116
|
+
log.debug "-- Copying external file #{file}"
|
117
|
+
FileUtils.cp(file, '.')
|
118
|
+
@content.add_item(file)
|
119
|
+
end if @options[:add]
|
111
120
|
end
|
112
121
|
|
113
122
|
def postprocess_file(asset)
|
114
123
|
source = IO.read(asset)
|
124
|
+
|
115
125
|
# Do rx substitutions
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
end
|
126
|
+
@options[:rx].each do |rx|
|
127
|
+
rx.strip!
|
128
|
+
delimiter = rx[0, 1]
|
129
|
+
rx = rx.gsub(/\\#{delimiter}/, "\n")
|
130
|
+
ra = rx.split(/#{delimiter}/).reject {|e| e.empty? }.each {|e| e.gsub!(/\n/, "#{delimiter}")}
|
131
|
+
raise ParserException, "Invalid regular expression" if ra.empty? || ra[0].nil? || ra.size > 2
|
132
|
+
pattern = ra[0]
|
133
|
+
replacement = ra[1] || ''
|
134
|
+
log.info "Replacing pattern /#{pattern.gsub(/#{delimiter}/, "\\#{delimiter}")}/ with \"#{replacement}\""
|
135
|
+
source.gsub!(Regexp.new(pattern), replacement)
|
136
|
+
end if @options[:rx]
|
137
|
+
|
129
138
|
# Add doctype if missing
|
130
139
|
if source !~ /\s*<!DOCTYPE/
|
131
140
|
log.debug "-- Adding missing doctype"
|
132
141
|
source = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + source
|
133
142
|
end
|
143
|
+
|
134
144
|
# Save processed file
|
135
145
|
File.open(asset, 'w') do |f|
|
136
146
|
f.write(source)
|
@@ -139,23 +149,61 @@ module Repub
|
|
139
149
|
|
140
150
|
def postprocess_doc(asset)
|
141
151
|
doc = Nokogiri::HTML.parse(IO.read(asset), nil, 'UTF-8')
|
142
|
-
|
143
|
-
|
144
|
-
|
152
|
+
|
153
|
+
# Set Content-Type charset to UTF-8
|
154
|
+
doc.xpath('//head/meta[@http-equiv="Content-Type"]').each do |el|
|
155
|
+
el['content'] = 'text/html; charset=utf-8'
|
156
|
+
end
|
157
|
+
|
158
|
+
# Process styles
|
159
|
+
if @options[:css] && !@options[:css].empty?
|
160
|
+
# Remove all stylesheet links
|
161
|
+
doc.xpath('//head/link[@rel="stylesheet"]').remove
|
162
|
+
if @options[:css] == '-'
|
163
|
+
# Also remove all inline styles
|
164
|
+
doc.xpath('//head/style').remove
|
165
|
+
log.info "Removing all stylesheet links and style elements"
|
166
|
+
else
|
167
|
+
# Add custom stylesheet link
|
168
|
+
link = Nokogiri::XML::Node.new('link', doc)
|
169
|
+
link['rel'] = 'stylesheet'
|
170
|
+
link['type'] = 'text/css'
|
145
171
|
link['href'] = File.basename(@options[:css])
|
146
|
-
|
172
|
+
# Add as the last child so it has precedence over (possible) inline styles before
|
173
|
+
doc.at('//head').add_child(link)
|
174
|
+
log.info "Replacing CSS refs with \"#{link['href']}\""
|
147
175
|
end
|
148
176
|
end
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
177
|
+
|
178
|
+
# Insert elements after/before selector
|
179
|
+
@options[:after].each do |e|
|
180
|
+
selector = e.keys.first
|
181
|
+
fragment = e[selector]
|
182
|
+
element = doc.xpath(selector).first
|
183
|
+
if element
|
184
|
+
log.info "Inserting fragment \"#{fragment.to_html}\" after \"#{selector}\""
|
185
|
+
fragment.children.to_a.reverse.each {|node| element.add_next_sibling(node) }
|
154
186
|
end
|
155
|
-
end
|
187
|
+
end if @options[:after]
|
188
|
+
@options[:before].each do |e|
|
189
|
+
selector = e.keys.first
|
190
|
+
fragment = e[selector]
|
191
|
+
element = doc.xpath(selector).first
|
192
|
+
if element
|
193
|
+
log.info "Inserting fragment \"#{fragment}\" before \"#{selector}\""
|
194
|
+
fragment.children.to_a.each {|node| element.add_previous_sibling(node) }
|
195
|
+
end
|
196
|
+
end if @options[:before]
|
197
|
+
|
198
|
+
# Remove elements
|
199
|
+
@options[:remove].each do |selector|
|
200
|
+
log.info "Removing elements \"#{selector}\""
|
201
|
+
doc.search(selector).remove
|
202
|
+
end if @options[:remove]
|
203
|
+
|
156
204
|
# Save processed doc
|
157
205
|
File.open(asset, 'w') do |f|
|
158
|
-
if @options[:fixup]
|
206
|
+
if @options[:fixup] || true
|
159
207
|
# HACK: Nokogiri seems to ignore the fact that xmlns and other attrs are already present
|
160
208
|
# in html node and adds them anyway. Just remove them here to avoid duplicates.
|
161
209
|
doc.root.attributes.each {|name, value| doc.root.remove_attribute(name) }
|
data/lib/repub/app/fetcher.rb
CHANGED
@@ -4,6 +4,7 @@ require 'uri'
|
|
4
4
|
require 'iconv'
|
5
5
|
require 'rubygems'
|
6
6
|
|
7
|
+
# Temporarily disable warnings from chardet
|
7
8
|
old_verbose = $VERBOSE
|
8
9
|
$VERBOSE = false
|
9
10
|
require 'UniversalDetector'
|
@@ -24,26 +25,27 @@ module Repub
|
|
24
25
|
:stylesheets => %w[css],
|
25
26
|
:images => %w[jpg jpeg png gif svg]
|
26
27
|
}
|
27
|
-
|
28
|
+
|
28
29
|
class Fetcher
|
29
30
|
include Logger
|
30
31
|
|
31
32
|
Downloaders = {
|
32
33
|
:wget => { :cmd => 'wget', :options => '-nv -E -H -k -p -nH -nd' },
|
33
|
-
:httrack => { :cmd => 'httrack', :options => '-
|
34
|
+
:httrack => { :cmd => 'httrack', :options => '-gBqQ -r2 +*.css +*.jpg -*.xml -*.html' }
|
34
35
|
}
|
35
36
|
|
36
37
|
def initialize(options)
|
37
38
|
@options = options
|
38
39
|
@downloader_path, @downloader_options = ENV['REPUB_DOWNLOADER'], ENV['REPUB_DOWNLOADER_OPTIONS']
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
40
|
+
downloader =
|
41
|
+
begin
|
42
|
+
Downloaders[@options[:helper].to_sym] || Downloaders[:wget]
|
43
|
+
rescue
|
44
|
+
Downloaders[:wget]
|
45
|
+
end
|
46
|
+
log.debug "-- Using #{downloader[:cmd]} #{downloader[:options]}"
|
47
|
+
@downloader_path ||= which(downloader[:cmd])
|
48
|
+
@downloader_options ||= downloader[:options]
|
47
49
|
end
|
48
50
|
|
49
51
|
def fetch
|
@@ -82,7 +84,7 @@ module Repub
|
|
82
84
|
encoding = UniversalDetector.chardet(s)['encoding']
|
83
85
|
end
|
84
86
|
if encoding.downcase != 'utf-8'
|
85
|
-
log.info "Source encoding
|
87
|
+
log.info "Source encoding appears to be #{encoding}, converting to UTF-8"
|
86
88
|
s = Iconv.conv('utf-8', encoding, IO.read(doc))
|
87
89
|
File.open(doc, 'w') { |f| f.write(s) }
|
88
90
|
end
|
data/lib/repub/app/options.rb
CHANGED
@@ -11,6 +11,9 @@ module Repub
|
|
11
11
|
|
12
12
|
# Default options
|
13
13
|
@options = {
|
14
|
+
:add => [],
|
15
|
+
:after => [],
|
16
|
+
:before => [],
|
14
17
|
:browser => false,
|
15
18
|
:css => nil,
|
16
19
|
:encoding => nil,
|
@@ -129,10 +132,38 @@ module Repub
|
|
129
132
|
opts.separator " Post-processing options:"
|
130
133
|
|
131
134
|
opts.on("-s", "--stylesheet PATH", String,
|
132
|
-
"Use custom stylesheet at PATH
|
133
|
-
"
|
134
|
-
) { |value| options[:css] = File.expand_path(value) }
|
135
|
+
"Use custom stylesheet at PATH. Use -s- to remove",
|
136
|
+
"all links to stylesheets and <style> blocks from the source."
|
137
|
+
) { |value| options[:css] = value == '-' ? value : File.expand_path(value) }
|
135
138
|
|
139
|
+
opts.on("-a", "--add PATH", String,
|
140
|
+
"Add external file to the generated ePub."
|
141
|
+
) { |value| options[:add] << File.expand_path(value) }
|
142
|
+
|
143
|
+
opts.on("-N", "--new-fragment XHTML", String,
|
144
|
+
"Prepare document fragment for -A and -P operations."
|
145
|
+
) do |value|
|
146
|
+
begin
|
147
|
+
@fragment = Nokogiri::HTML.fragment(value)
|
148
|
+
rescue Exception => ex
|
149
|
+
log.fatal "ERROR: invalid fragment: #{ex.to_s}"
|
150
|
+
end
|
151
|
+
end
|
152
|
+
|
153
|
+
opts.on("-A", "--after SELECTOR", String,
|
154
|
+
"Insert fragment after element with XPath selector."
|
155
|
+
) do |value|
|
156
|
+
log.fatal "ERROR: -A requires a fragment. See '#{App.name} --help'." if !@fragment
|
157
|
+
@options[:after] << {value => @fragment.clone}
|
158
|
+
end
|
159
|
+
|
160
|
+
opts.on("-P", "--before SELECTOR", String,
|
161
|
+
"Insert fragment before element with XPath selector."
|
162
|
+
) do |value|
|
163
|
+
log.fatal "ERROR: -P requires a fragment. See '#{App.name} --help'." if !@fragment
|
164
|
+
@options[:before] << {value => @fragment.clone}
|
165
|
+
end
|
166
|
+
|
136
167
|
opts.on("-X", "--remove SELECTOR", String,
|
137
168
|
"Remove source element using XPath selector.",
|
138
169
|
"Use -X- to ignore stored profile."
|
@@ -143,7 +174,7 @@ module Repub
|
|
143
174
|
"Use -R- to ignore stored profile."
|
144
175
|
) { |value| value == '-' ? options[:rx] = [] : options[:rx] << value }
|
145
176
|
|
146
|
-
opts.on("-B", "--
|
177
|
+
opts.on("-B", "--browser",
|
147
178
|
"After processing, open resulting HTML in default browser."
|
148
179
|
) { |value| options[:browser] = true }
|
149
180
|
|
@@ -177,4 +208,4 @@ module Repub
|
|
177
208
|
|
178
209
|
end
|
179
210
|
end
|
180
|
-
end
|
211
|
+
end
|