invisiblellama-repub 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +11 -1
- data/README.txt +30 -19
- data/Rakefile +2 -2
- data/SAMPLES.txt +23 -0
- data/{TODO.txt → TODO} +1 -0
- data/bin/repub +1 -17
- data/lib/repub.rb +1 -1
- data/lib/repub/app.rb +4 -2
- data/lib/repub/app/builder.rb +16 -8
- data/lib/repub/app/fetcher.rb +25 -23
- data/lib/repub/app/options.rb +15 -8
- data/lib/repub/app/parser.rb +32 -19
- data/repub.gemspec +48 -0
- data/test/epub/test_container.rb +2 -2
- data/test/epub/test_content.rb +3 -3
- data/test/epub/test_toc.rb +3 -3
- data/test/test_builder.rb +1 -1
- data/test/test_fetcher.rb +36 -36
- metadata +12 -13
- data/.gitignore +0 -4
- data/lib/repub/mobi/.githidden +0 -0
data/History.txt
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
-
== 0.1 / 2009-06-26
|
1
|
+
== 0.2.1 / 2009-06-26
|
2
2
|
|
3
3
|
* Initial release
|
4
|
+
|
5
|
+
== 0.3.0 / 2009-06-28
|
6
|
+
|
7
|
+
* Switched to Nokogiri for HTML parsing
|
8
|
+
* Better parsing for hierarchical TOCs
|
9
|
+
* Many bug fixes
|
10
|
+
|
11
|
+
== 0.3.1 / 2009-06-28
|
12
|
+
|
13
|
+
* Fixed App.data_path bug
|
data/README.txt
CHANGED
@@ -1,27 +1,31 @@
|
|
1
1
|
== DESCRIPTION:
|
2
2
|
|
3
|
-
|
3
|
+
Simple HTML to ePub converter.
|
4
4
|
|
5
5
|
== FEATURES/PROBLEMS:
|
6
6
|
|
7
|
-
Few samples to get started:
|
7
|
+
Few samples to get started:
|
8
|
+
|
9
|
+
* Git User's Manual
|
10
|
+
|
11
|
+
repub -x 'title://h1' -x 'toc://div[@class="toc"]/dl' -x 'toc_item:dt' -x 'toc_section:following-sibling::*[1]/dl' \
|
12
|
+
http://www.kernel.org/pub/software/scm/git/docs/user-manual.html
|
8
13
|
|
9
14
|
* Project Gutenberg's THE ADVENTURES OF SHERLOCK HOLMES
|
10
|
-
|
11
|
-
-
|
12
|
-
|
15
|
+
|
16
|
+
repub -x 'title:div[@class='book']//h1' -x 'toc://table' -x 'toc_item://tr' \
|
17
|
+
-X '//pre' -X '//hr' -X '//body/h1' -X '//body/h2' \
|
18
|
+
http://www.gutenberg.org/dirs/etext99/advsh12h.htm
|
13
19
|
|
14
20
|
* Project Gutenberg's ALICE'S ADVENTURES IN WONDERLAND
|
15
|
-
|
16
|
-
-
|
17
|
-
|
21
|
+
|
22
|
+
repub -x 'title:body/h1' -x 'toc://table' -x 'toc_item://tr' \
|
23
|
+
-X '//pre' -X '//hr' -X '//body/h4' \
|
24
|
+
http://www.gutenberg.org/files/11/11-h/11-h.htm
|
18
25
|
|
19
26
|
* The Gelug-Kagyu Tradition of Mahamudra from Berzin Archives
|
20
|
-
repub http://www.berzinarchives.com/web/x/prn/p.html_680632258.html
|
21
27
|
|
22
|
-
|
23
|
-
repub -x 'title://h1' -x 'toc://div.toc/dl' -x 'toc_item:/dt' \
|
24
|
-
http://www.kernel.org/pub/software/scm/git/docs/user-manual.html
|
28
|
+
repub http://www.berzinarchives.com/web/x/prn/p.html_680632258.html
|
25
29
|
|
26
30
|
== SYNOPSIS:
|
27
31
|
|
@@ -43,7 +47,7 @@ General options:
|
|
43
47
|
-h, --help Show this help message.
|
44
48
|
|
45
49
|
Parser options:
|
46
|
-
-x, --selector NAME:VALUE Set parser XPath
|
50
|
+
-x, --selector NAME:VALUE Set parser XPath selector NAME to VALUE.
|
47
51
|
Recognized selectors are: [title toc toc_item toc_section]
|
48
52
|
-m, --meta NAME:VALUE Set publication information metadata NAME to VALUE.
|
49
53
|
Valid metadata names are: [creator date description
|
@@ -55,16 +59,21 @@ Parser options:
|
|
55
59
|
Post-processing options:
|
56
60
|
-s, --stylesheet PATH Use custom stylesheet at PATH to add or override existing
|
57
61
|
CSS references in the source document.
|
58
|
-
-X, --remove SELECTOR Remove source element using XPath
|
62
|
+
-X, --remove SELECTOR Remove source element using XPath selector.
|
59
63
|
Use -X- to ignore stored profile.
|
60
64
|
-R, --rx /PATTERN/REPLACEMENT/ Edit source HTML using regular expressions.
|
61
65
|
Use -R- to ignore stored profile.
|
62
66
|
-B, --browse After processing, open resulting HTML in default browser.
|
63
67
|
|
64
|
-
==
|
68
|
+
== DEPENDENCIES:
|
65
69
|
|
66
|
-
|
67
|
-
|
70
|
+
* Builder (https://rubyforge.org/projects/builder/)
|
71
|
+
* Nokogiri (http://nokogiri.rubyforge.org/nokogiri/)
|
72
|
+
* rchardet (https://rubyforge.org/projects/rchardet/)
|
73
|
+
* launchy (http://copiousfreetime.rubyforge.org/launchy/)
|
74
|
+
|
75
|
+
* wget or httrack
|
76
|
+
* zip (Info-ZIP)
|
68
77
|
|
69
78
|
== INSTALL:
|
70
79
|
|
@@ -72,9 +81,9 @@ Post-processing options:
|
|
72
81
|
|
73
82
|
== LICENSE:
|
74
83
|
|
75
|
-
The MIT License
|
84
|
+
(The MIT License)
|
76
85
|
|
77
|
-
Copyright (c) 2009 Invisible Llama
|
86
|
+
Copyright (c) 2009 Invisible Llama <dg@invisiblellama.net>
|
78
87
|
|
79
88
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
80
89
|
of this software and associated documentation files (the "Software"), to deal
|
@@ -93,3 +102,5 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
93
102
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
94
103
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
95
104
|
THE SOFTWARE.
|
105
|
+
|
106
|
+
==
|
data/Rakefile
CHANGED
@@ -20,11 +20,11 @@ PROJ.email = 'dg@invisiblellama.net'
|
|
20
20
|
PROJ.url = 'http://github.com/invisiblellama/repub/tree/master'
|
21
21
|
PROJ.version = Repub::VERSION
|
22
22
|
PROJ.rubyforge.name = 'repub'
|
23
|
-
PROJ.exclude = %w[tmp/ \.git
|
23
|
+
PROJ.exclude = %w[tmp/ \.git \.DS_Store .*\.tmproj .*\.epub ^pkg/]
|
24
24
|
|
25
25
|
PROJ.spec.opts << '--color'
|
26
26
|
|
27
|
+
depend_on 'nokogiri'
|
27
28
|
depend_on 'builder'
|
28
|
-
depend_on 'hpricot'
|
29
29
|
depend_on 'chardet'
|
30
30
|
depend_on 'launchy'
|
data/SAMPLES.txt
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
* THE ADVENTURES OF SHERLOCK HOLMES
|
2
|
+
|
3
|
+
repub -x 'title:div[@class='book']//h1' -x 'toc://table' -x 'toc_item://tr' -X '//pre' -X '//hr' -X '//body/h1' -X '//body/h2' http://www.gutenberg.org/dirs/etext99/advsh12h.htm
|
4
|
+
|
5
|
+
* ALICE'S ADVENTURES IN WONDERLAND
|
6
|
+
|
7
|
+
repub -x 'title:body/h1' -x 'toc://table' -x 'toc_item://tr' -X '//pre' -X '//hr' -X '//body/h4' http://www.gutenberg.org/files/11/11-h/11-h.htm
|
8
|
+
|
9
|
+
* The Gelug-Kagyu Tradition of Mahamudra
|
10
|
+
|
11
|
+
repub http://www.berzinarchives.com/web/x/prn/p.html_680632258.html
|
12
|
+
|
13
|
+
* Брюс Стерлинг. Схизматрица
|
14
|
+
|
15
|
+
repub -x 'title://h2' -x 'toc://table' -x 'toc_item://a' -X 'div' -X 'table' -X '//hr' http://lib.ru/STERLINGB/shizmatrica.txt_with-big-pictures.html
|
16
|
+
|
17
|
+
* Айзек Азимов. Космические течения
|
18
|
+
|
19
|
+
repub -x 'title://h2' -x 'toc://table' -x 'toc_item://a' -X 'div' -X 'table' -X '//hr' http://lib.ru/FOUNDATION/currspac.txt_with-big-pictures.html
|
20
|
+
|
21
|
+
* Git User's Manual
|
22
|
+
|
23
|
+
repub -x 'title://h1' -x 'toc://div[@class="toc"]/dl' -x 'toc_item:dt' -x 'toc_section:following-sibling::*[1]/dl' http://www.kernel.org/pub/software/scm/git/docs/user-manual.html
|
data/{TODO.txt → TODO}
RENAMED
data/bin/repub
CHANGED
@@ -1,24 +1,8 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
1
|
+
#!/usr/bin/env ruby -w
|
2
2
|
|
3
3
|
require File.expand_path(
|
4
4
|
File.join(File.dirname(__FILE__), %w[.. lib repub]))
|
5
5
|
|
6
6
|
require 'repub/app'
|
7
7
|
|
8
|
-
# THE ADVENTURES OF SHERLOCK HOLMES
|
9
|
-
# repub -x 'title:body/h1' -x 'toc:body//table' 'toc_item://tr' -X 'body/pre,body//hr,body/h1,body/h2' http://www.gutenberg.org/dirs/etext99/advsh12h.htm
|
10
|
-
#
|
11
|
-
# ALICE'S ADVENTURES IN WONDERLAND
|
12
|
-
# repub -x 'title:body/h1' -x 'toc:body//table' -x 'toc_item://tr' -X 'body/pre,body//hr,body/h4' http://www.gutenberg.org/files/11/11-h/11-h.htm
|
13
|
-
#
|
14
|
-
# The Gelug-Kagyu Tradition of Mahamudra
|
15
|
-
# http://www.berzinarchives.com/web/x/prn/p.html_680632258.html
|
16
|
-
#
|
17
|
-
# Брюс Стерлинг. Схизматрица
|
18
|
-
# repub -x 'title://h2' -x 'toc:table' -x 'toc_item://a' -X 'div,table,//hr' http://lib.ru/STERLINGB/shizmatrica.txt_with-big-pictures.html
|
19
|
-
#
|
20
|
-
# Git User's Manual
|
21
|
-
# repub -x 'title://h1' -x 'toc://div.toc/dl' -x 'toc_item:/dt' http://www.kernel.org/pub/software/scm/git/docs/user-manual.html
|
22
|
-
|
23
|
-
|
24
8
|
Repub::App.instance.run ARGV
|
data/lib/repub.rb
CHANGED
data/lib/repub/app.rb
CHANGED
@@ -2,9 +2,9 @@ require 'singleton'
|
|
2
2
|
require 'rubygems'
|
3
3
|
require 'launchy'
|
4
4
|
require 'repub/app/utility'
|
5
|
+
require 'repub/app/logger'
|
5
6
|
require 'repub/app/options'
|
6
7
|
require 'repub/app/profile'
|
7
|
-
require 'repub/app/logger'
|
8
8
|
require 'repub/app/fetcher'
|
9
9
|
require 'repub/app/parser'
|
10
10
|
require 'repub/app/builder'
|
@@ -21,7 +21,9 @@ module Repub
|
|
21
21
|
end
|
22
22
|
|
23
23
|
def self.data_path
|
24
|
-
File.join(File.expand_path('~'), '.repub')
|
24
|
+
data_path = File.join(File.expand_path('~'), '.repub')
|
25
|
+
FileUtils.mkdir_p(data_path) unless File.exist?(data_path)
|
26
|
+
data_path
|
25
27
|
end
|
26
28
|
|
27
29
|
def run(args)
|
data/lib/repub/app/builder.rb
CHANGED
@@ -97,18 +97,17 @@ module Repub
|
|
97
97
|
log.debug "-- Adding missing doctype"
|
98
98
|
source = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + source
|
99
99
|
end
|
100
|
-
#
|
100
|
+
# Save processed file
|
101
101
|
File.open(asset, 'w') do |f|
|
102
102
|
f.write(source)
|
103
103
|
end
|
104
104
|
end
|
105
105
|
|
106
106
|
def postprocess_doc(asset)
|
107
|
-
|
108
|
-
|
109
|
-
# Substitute custom stylesheet
|
107
|
+
doc = Nokogiri::HTML.parse(open(asset), nil, 'UTF-8')
|
108
|
+
# Substitute custom CSS
|
110
109
|
if (@options[:css] && !@options[:css].empty?)
|
111
|
-
doc.
|
110
|
+
doc.xpath('//link[@rel="stylesheet"]') do |link|
|
112
111
|
link[:href] = File.basename(@options[:css])
|
113
112
|
log.debug "-- Replacing CSS refs with #{link[:href]}"
|
114
113
|
end
|
@@ -116,13 +115,22 @@ module Repub
|
|
116
115
|
# Remove elements
|
117
116
|
if @options[:remove] && !@options[:remove].empty?
|
118
117
|
@options[:remove].each do |selector|
|
119
|
-
log.info "Removing
|
118
|
+
log.info "Removing elements matching selector \"#{selector}\""
|
119
|
+
#p doc.search(selector).size
|
120
|
+
#p doc.search(selector)
|
120
121
|
doc.search(selector).remove
|
121
122
|
end
|
122
123
|
end
|
123
|
-
#
|
124
|
+
# Save processed doc
|
124
125
|
File.open(asset, 'w') do |f|
|
125
|
-
|
126
|
+
if @options[:fixup]
|
127
|
+
# HACK: Nokogiri seems to ignore the fact that xmlns and other attrs are already present
|
128
|
+
# in html node and adds them anyway. Just remove them here to avoid duplicates.
|
129
|
+
doc.root.attributes.each {|name, value| doc.root.remove_attribute(name) }
|
130
|
+
doc.write_xhtml_to(f, :encoding => 'UTF-8')
|
131
|
+
else
|
132
|
+
doc.write_html_to(f, :encoding => 'UTF-8')
|
133
|
+
end
|
126
134
|
end
|
127
135
|
end
|
128
136
|
|
data/lib/repub/app/fetcher.rb
CHANGED
@@ -4,10 +4,10 @@ require 'uri'
|
|
4
4
|
require 'iconv'
|
5
5
|
require 'rubygems'
|
6
6
|
|
7
|
-
|
8
|
-
$VERBOSE=false
|
7
|
+
old_verbose = $VERBOSE
|
8
|
+
$VERBOSE = false
|
9
9
|
require 'UniversalDetector'
|
10
|
-
$VERBOSE=
|
10
|
+
$VERBOSE = old_verbose
|
11
11
|
|
12
12
|
module Repub
|
13
13
|
class App
|
@@ -101,8 +101,9 @@ module Repub
|
|
101
101
|
end
|
102
102
|
|
103
103
|
def for_url(&block)
|
104
|
-
# if not yet cached
|
105
|
-
|
104
|
+
# Download stuff if not yet cached
|
105
|
+
cached = File.exist?(@path)
|
106
|
+
unless cached
|
106
107
|
FileUtils.mkdir_p(@path)
|
107
108
|
begin
|
108
109
|
Dir.chdir(@path) { yield self }
|
@@ -111,32 +112,33 @@ module Repub
|
|
111
112
|
raise
|
112
113
|
end
|
113
114
|
else
|
114
|
-
log.
|
115
|
+
log.info "Using cached assets"
|
116
|
+
log.debug "-- Cache is #{@path}"
|
115
117
|
end
|
116
|
-
#
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
@assets[asset_type] << Dir.glob("*.#{file_type}")
|
125
|
-
end
|
126
|
-
@assets[asset_type].flatten!
|
118
|
+
# Do post-download tasks
|
119
|
+
Dir.chdir(@path) do
|
120
|
+
# Enumerate assets
|
121
|
+
@assets = {}
|
122
|
+
AssetTypes.each_pair do |asset_type, file_types|
|
123
|
+
@assets[asset_type] ||= []
|
124
|
+
file_types.each do |file_type|
|
125
|
+
@assets[asset_type] << Dir.glob("*.#{file_type}")
|
127
126
|
end
|
128
|
-
|
127
|
+
@assets[asset_type].flatten!
|
128
|
+
end
|
129
|
+
# For freshly downloaded docs, detect encoding and convert to utf-8
|
130
|
+
unless cached
|
129
131
|
@assets[:documents].each do |doc|
|
130
|
-
log.
|
132
|
+
log.info "Detecting encoding for #{doc}"
|
131
133
|
s = IO.read(doc)
|
132
134
|
raise FetcherException, "empty document" unless s
|
133
|
-
encoding = UniversalDetector
|
135
|
+
encoding = UniversalDetector.chardet(s)['encoding']
|
134
136
|
if encoding.downcase != 'utf-8'
|
135
|
-
log.
|
136
|
-
s = Iconv.conv('utf-8', encoding,
|
137
|
+
log.info "Looks like #{encoding}, converting to UTF-8"
|
138
|
+
s = Iconv.conv('utf-8', encoding, IO.read(doc))
|
137
139
|
File.open(doc, 'w') { |f| f.write(s) }
|
138
140
|
else
|
139
|
-
log.
|
141
|
+
log.info "Looks like UTF-8, no conversion needed"
|
140
142
|
end
|
141
143
|
end
|
142
144
|
end
|
data/lib/repub/app/options.rb
CHANGED
@@ -3,6 +3,7 @@ require 'optparse'
|
|
3
3
|
module Repub
|
4
4
|
class App
|
5
5
|
module Options
|
6
|
+
include Logger
|
6
7
|
|
7
8
|
attr_reader :options
|
8
9
|
|
@@ -91,10 +92,14 @@ module Repub
|
|
91
92
|
opts.separator " Parser options:"
|
92
93
|
|
93
94
|
opts.on("-x", "--selector NAME:VALUE", String,
|
94
|
-
"Set parser XPath
|
95
|
+
"Set parser XPath selector NAME to VALUE.",
|
95
96
|
"Recognized selectors are: [title toc toc_item toc_section]"
|
96
97
|
) do |value|
|
97
|
-
|
98
|
+
begin
|
99
|
+
name, value = value.match(/([^:]+):(.*)/)[1, 2]
|
100
|
+
rescue
|
101
|
+
log.fatal "ERROR: invalid argument: -x '#{value}'. See '#{App.name} --help'."
|
102
|
+
end
|
98
103
|
options[:selectors][name.to_sym] = value
|
99
104
|
end
|
100
105
|
|
@@ -103,7 +108,11 @@ module Repub
|
|
103
108
|
"Valid metadata names are: [creator date description",
|
104
109
|
"language publisher relation rights subject title]"
|
105
110
|
) do |value|
|
106
|
-
|
111
|
+
begin
|
112
|
+
name, value = value.match(/([^:]+):(.*)/)[1, 2]
|
113
|
+
rescue
|
114
|
+
log.fatal "ERROR: invalid argument: -m '#{value}'. See '#{App.name} --help'."
|
115
|
+
end
|
107
116
|
options[:metadata][name.to_sym] = value
|
108
117
|
end
|
109
118
|
|
@@ -125,7 +134,7 @@ module Repub
|
|
125
134
|
) { |value| options[:css] = File.expand_path(value) }
|
126
135
|
|
127
136
|
opts.on("-X", "--remove SELECTOR", String,
|
128
|
-
"Remove source element using XPath
|
137
|
+
"Remove source element using XPath selector.",
|
129
138
|
"Use -X- to ignore stored profile."
|
130
139
|
) { |value| value == '-' ? options[:remove] = [] : options[:remove] << value }
|
131
140
|
|
@@ -148,15 +157,13 @@ module Repub
|
|
148
157
|
begin
|
149
158
|
parser.parse! args
|
150
159
|
rescue OptionParser::ParseError => ex
|
151
|
-
|
152
|
-
exit 1
|
160
|
+
log.fatal "ERROR: #{ex.to_s}. See '#{App.name} --help'."
|
153
161
|
end
|
154
162
|
|
155
163
|
options[:url] = args.last
|
156
164
|
if options[:url].nil? || options[:url].empty?
|
157
165
|
help parser
|
158
|
-
|
159
|
-
exit 1
|
166
|
+
log.fatal "ERROR: Please specify an URL."
|
160
167
|
end
|
161
168
|
end
|
162
169
|
|
data/lib/repub/app/parser.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
require 'rubygems'
|
2
|
-
require '
|
2
|
+
require 'nokogiri'
|
3
3
|
|
4
4
|
module Repub
|
5
5
|
class App
|
@@ -11,13 +11,13 @@ module Repub
|
|
11
11
|
Parser.new(options).parse(cache)
|
12
12
|
end
|
13
13
|
|
14
|
-
# Default
|
14
|
+
# Default selectors
|
15
15
|
#
|
16
16
|
Selectors = {
|
17
17
|
:title => '//h1',
|
18
|
-
:toc => '//
|
19
|
-
:toc_item => '
|
20
|
-
:toc_section => '
|
18
|
+
:toc => '//ul',
|
19
|
+
:toc_item => './li',
|
20
|
+
:toc_section => './ul'
|
21
21
|
}
|
22
22
|
|
23
23
|
class Parser
|
@@ -43,7 +43,7 @@ module Repub
|
|
43
43
|
@cache = cache
|
44
44
|
@asset = @cache.assets[:documents][0]
|
45
45
|
log.debug "-- Parsing #{@asset}"
|
46
|
-
@doc =
|
46
|
+
@doc = Nokogiri::HTML.parse(open(File.join(@cache.path, @asset)), nil, 'UTF-8')
|
47
47
|
|
48
48
|
@uid = @cache.name
|
49
49
|
parse_title
|
@@ -64,13 +64,13 @@ module Repub
|
|
64
64
|
if el.children.empty?
|
65
65
|
title_text = el.inner_text
|
66
66
|
else
|
67
|
-
title_text =
|
67
|
+
title_text = el.children.map{|c| c.inner_text }.join(' ')
|
68
68
|
end
|
69
69
|
@title = title_text.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
|
70
70
|
log.info "Found title \"#{@title}\""
|
71
71
|
else
|
72
72
|
@title = UNTITLED
|
73
|
-
log.warn "** Could not
|
73
|
+
log.warn "** Could not find document title, using '#{@title}'"
|
74
74
|
end
|
75
75
|
end
|
76
76
|
|
@@ -80,6 +80,8 @@ module Repub
|
|
80
80
|
@title_html = el ? el.inner_html.gsub(/[\r\n]/, '') : UNTITLED
|
81
81
|
end
|
82
82
|
|
83
|
+
# Helper container for TOC items
|
84
|
+
#
|
83
85
|
class TocItem < Struct.new(
|
84
86
|
:title,
|
85
87
|
:uri,
|
@@ -102,31 +104,42 @@ module Repub
|
|
102
104
|
|
103
105
|
def parse_toc
|
104
106
|
log.debug "-- Looking for TOC with #{@selectors[:toc]}"
|
105
|
-
el = @doc.
|
107
|
+
el = @doc.xpath(@selectors[:toc]).first
|
106
108
|
if el
|
107
109
|
@toc = parse_toc_section(el)
|
108
110
|
log.info "Found TOC with #{@toc.size} top-level items"
|
109
111
|
else
|
110
112
|
@toc = []
|
111
|
-
log.warn "** Could not
|
113
|
+
log.warn "** Could not find document table of contents"
|
112
114
|
end
|
113
115
|
end
|
114
116
|
|
115
117
|
def parse_toc_section(section)
|
116
118
|
toc = []
|
117
119
|
log.debug "-- Looking for TOC items with #{@selectors[:toc_item]}"
|
118
|
-
section.
|
120
|
+
section.xpath(@selectors[:toc_item]).each do |item|
|
121
|
+
# Get item's anchor and href
|
119
122
|
a = item.name == 'a' ? item : item.at('a')
|
120
|
-
next if a
|
121
|
-
href = a[
|
122
|
-
next if href
|
123
|
-
|
124
|
-
|
123
|
+
next if !a
|
124
|
+
href = a[:href]
|
125
|
+
next if !href
|
126
|
+
# Is this a leaf item or node ?
|
127
|
+
subsection = item.xpath(@selectors[:toc_section]).first
|
128
|
+
if subsection
|
129
|
+
# Item has subsection, use anchor text for title
|
130
|
+
title = a.inner_text
|
131
|
+
else
|
132
|
+
# Leaf item, glue inner_text from all children
|
133
|
+
title = item.children.map{|c| c.inner_text }.join(' ')
|
134
|
+
end
|
135
|
+
title = title.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
|
125
136
|
log.debug "-- Found item: #{title}"
|
126
|
-
|
127
|
-
|
137
|
+
# Parse sub-section
|
138
|
+
if subsection
|
139
|
+
log.debug "-- Found section with #{@selectors[:toc_section]}"
|
140
|
+
log.debug "-- >"
|
128
141
|
subitems = parse_toc_section(subsection)
|
129
|
-
log.debug '--
|
142
|
+
log.debug '-- .'
|
130
143
|
end
|
131
144
|
toc << TocItem.new(title, href, subitems, @asset)
|
132
145
|
end
|
data/repub.gemspec
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
Gem::Specification.new do |s|
|
4
|
+
s.name = %q{repub}
|
5
|
+
s.version = "0.3.1"
|
6
|
+
|
7
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
|
+
s.authors = ["Dmitri Goutnik"]
|
9
|
+
s.date = %q{2009-06-28}
|
10
|
+
s.default_executable = %q{repub}
|
11
|
+
s.description = %q{Simple HTML to ePub converter.}
|
12
|
+
s.email = %q{dg@invisiblellama.net}
|
13
|
+
s.executables = ["repub"]
|
14
|
+
s.extra_rdoc_files = ["History.txt", "README.txt", "SAMPLES.txt", "bin/repub"]
|
15
|
+
s.files = ["History.txt", "README.txt", "Rakefile", "SAMPLES.txt", "TODO", "bin/repub", "lib/repub.rb", "lib/repub/app.rb", "lib/repub/app/builder.rb", "lib/repub/app/fetcher.rb", "lib/repub/app/logger.rb", "lib/repub/app/options.rb", "lib/repub/app/parser.rb", "lib/repub/app/profile.rb", "lib/repub/app/utility.rb", "lib/repub/epub.rb", "lib/repub/epub/container.rb", "lib/repub/epub/content.rb", "lib/repub/epub/toc.rb", "repub.gemspec", "test/epub/test_container.rb", "test/epub/test_content.rb", "test/epub/test_toc.rb", "test/test_builder.rb", "test/test_fetcher.rb", "test/test_logger.rb", "test/test_parser.rb"]
|
16
|
+
s.homepage = %q{http://github.com/invisiblellama/repub/tree/master}
|
17
|
+
s.rdoc_options = ["--main", "README.txt"]
|
18
|
+
s.require_paths = ["lib"]
|
19
|
+
s.rubyforge_project = %q{repub}
|
20
|
+
s.rubygems_version = %q{1.3.4}
|
21
|
+
s.summary = %q{Simple HTML to ePub converter}
|
22
|
+
s.test_files = ["test/epub/test_container.rb", "test/epub/test_content.rb", "test/epub/test_toc.rb", "test/test_builder.rb", "test/test_fetcher.rb", "test/test_logger.rb", "test/test_parser.rb"]
|
23
|
+
|
24
|
+
if s.respond_to? :specification_version then
|
25
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
26
|
+
s.specification_version = 3
|
27
|
+
|
28
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
29
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 1.3.2"])
|
30
|
+
s.add_runtime_dependency(%q<builder>, [">= 2.1.2"])
|
31
|
+
s.add_runtime_dependency(%q<chardet>, [">= 0.9.0"])
|
32
|
+
s.add_runtime_dependency(%q<launchy>, [">= 0.3.3"])
|
33
|
+
s.add_development_dependency(%q<bones>, [">= 2.5.1"])
|
34
|
+
else
|
35
|
+
s.add_dependency(%q<nokogiri>, [">= 1.3.2"])
|
36
|
+
s.add_dependency(%q<builder>, [">= 2.1.2"])
|
37
|
+
s.add_dependency(%q<chardet>, [">= 0.9.0"])
|
38
|
+
s.add_dependency(%q<launchy>, [">= 0.3.3"])
|
39
|
+
s.add_dependency(%q<bones>, [">= 2.5.1"])
|
40
|
+
end
|
41
|
+
else
|
42
|
+
s.add_dependency(%q<nokogiri>, [">= 1.3.2"])
|
43
|
+
s.add_dependency(%q<builder>, [">= 2.1.2"])
|
44
|
+
s.add_dependency(%q<chardet>, [">= 0.9.0"])
|
45
|
+
s.add_dependency(%q<launchy>, [">= 0.3.3"])
|
46
|
+
s.add_dependency(%q<bones>, [">= 2.5.1"])
|
47
|
+
end
|
48
|
+
end
|
data/test/epub/test_container.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
require 'test/unit'
|
2
2
|
require 'rubygems'
|
3
|
-
require '
|
3
|
+
require 'nokogiri'
|
4
4
|
require 'repub/epub'
|
5
5
|
|
6
6
|
class TestContainer < Test::Unit::TestCase
|
7
7
|
def test_container_create
|
8
8
|
c = Repub::Epub::Container.new
|
9
9
|
s = c.to_xml
|
10
|
-
doc =
|
10
|
+
doc = Nokogiri::HTML(s)
|
11
11
|
#puts s
|
12
12
|
|
13
13
|
assert_not_nil(doc.search('rootfile'))
|
data/test/epub/test_content.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'test/unit'
|
2
2
|
require 'rubygems'
|
3
|
-
require '
|
3
|
+
require 'nokogiri'
|
4
4
|
require 'repub/epub'
|
5
5
|
|
6
6
|
class TestContent < Test::Unit::TestCase
|
@@ -8,7 +8,7 @@ class TestContent < Test::Unit::TestCase
|
|
8
8
|
x = Repub::Epub::Content.new('some-name')
|
9
9
|
s = x.to_xml
|
10
10
|
#puts s
|
11
|
-
doc =
|
11
|
+
doc = Nokogiri::HTML(s)
|
12
12
|
|
13
13
|
# manifest was created
|
14
14
|
assert_not_nil(doc.search('manifest'))
|
@@ -35,7 +35,7 @@ class TestContent < Test::Unit::TestCase
|
|
35
35
|
x.add_document 'glossary.html', 'glossary'
|
36
36
|
s = x.to_xml
|
37
37
|
#puts s
|
38
|
-
doc =
|
38
|
+
doc = Nokogiri::HTML(s)
|
39
39
|
|
40
40
|
# manifest was created
|
41
41
|
assert_not_nil(doc.search('manifest'))
|
data/test/epub/test_toc.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'test/unit'
|
2
2
|
require 'rubygems'
|
3
|
-
require '
|
3
|
+
require 'nokogiri'
|
4
4
|
require 'repub/epub'
|
5
5
|
|
6
6
|
class TestToc < Test::Unit::TestCase
|
@@ -8,7 +8,7 @@ class TestToc < Test::Unit::TestCase
|
|
8
8
|
x = Repub::Epub::Toc.new('some-name')
|
9
9
|
s = x.to_xml
|
10
10
|
#puts s
|
11
|
-
doc =
|
11
|
+
doc = Nokogiri::HTML(s)
|
12
12
|
# TODO
|
13
13
|
end
|
14
14
|
|
@@ -23,7 +23,7 @@ class TestToc < Test::Unit::TestCase
|
|
23
23
|
p12 = p1.add_nav_point('Chapter 1-2', 'chapter-1-2.html')
|
24
24
|
s = x.to_xml
|
25
25
|
#puts s
|
26
|
-
doc =
|
26
|
+
doc = Nokogiri::HTML(s)
|
27
27
|
# TODO
|
28
28
|
end
|
29
29
|
end
|
data/test/test_builder.rb
CHANGED
data/test/test_fetcher.rb
CHANGED
@@ -1,36 +1,36 @@
|
|
1
|
-
require 'test/unit'
|
2
|
-
require 'repub'
|
3
|
-
require 'repub/app'
|
4
|
-
|
5
|
-
class TestFetcher < Test::Unit::TestCase
|
6
|
-
|
7
|
-
include Repub::App::Fetcher
|
8
|
-
attr_reader :options
|
9
|
-
|
10
|
-
def test_fetcher
|
11
|
-
@options = {
|
12
|
-
:url => 'http://www.berzinarchives.com/web/x/prn/p.html_1614431902.html',
|
13
|
-
:helper => 'wget'
|
14
|
-
}
|
15
|
-
assert_nothing_raised do
|
16
|
-
cache = fetch
|
17
|
-
#p cache
|
18
|
-
assert_equal('http://www.berzinarchives.com/web/x/prn/p.html_1614431902.html', cache.url)
|
19
|
-
assert(cache.path.include?('.repub/cache/f963050ead9ee7775a4155e13743d47bc851d5d8'))
|
20
|
-
assert_equal('f963050ead9ee7775a4155e13743d47bc851d5d8', cache.name)
|
21
|
-
# assert(File.exist?(File.join(f.asset_root, f.asset_name)), "Fetch failed.")
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
def test_fetcher_fail
|
26
|
-
@options = {
|
27
|
-
:url => 'not-existing',
|
28
|
-
:helper => 'wget'
|
29
|
-
}
|
30
|
-
assert_raise(Repub::App::FetcherException) do
|
31
|
-
cache = fetch
|
32
|
-
#p cache
|
33
|
-
end
|
34
|
-
end
|
35
|
-
|
36
|
-
end
|
1
|
+
require 'test/unit'
|
2
|
+
require 'repub'
|
3
|
+
require 'repub/app'
|
4
|
+
|
5
|
+
class TestFetcher < Test::Unit::TestCase
|
6
|
+
|
7
|
+
include Repub::App::Fetcher
|
8
|
+
attr_reader :options
|
9
|
+
|
10
|
+
def test_fetcher
|
11
|
+
@options = {
|
12
|
+
:url => 'http://www.berzinarchives.com/web/x/prn/p.html_1614431902.html',
|
13
|
+
:helper => 'wget'
|
14
|
+
}
|
15
|
+
assert_nothing_raised do
|
16
|
+
cache = fetch
|
17
|
+
#p cache
|
18
|
+
assert_equal('http://www.berzinarchives.com/web/x/prn/p.html_1614431902.html', cache.url)
|
19
|
+
assert(cache.path.include?('.repub/cache/f963050ead9ee7775a4155e13743d47bc851d5d8'))
|
20
|
+
assert_equal('f963050ead9ee7775a4155e13743d47bc851d5d8', cache.name)
|
21
|
+
# assert(File.exist?(File.join(f.asset_root, f.asset_name)), "Fetch failed.")
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def test_fetcher_fail
|
26
|
+
@options = {
|
27
|
+
:url => 'not-existing',
|
28
|
+
:helper => 'wget'
|
29
|
+
}
|
30
|
+
assert_raise(Repub::App::FetcherException) do
|
31
|
+
cache = fetch
|
32
|
+
#p cache
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: invisiblellama-repub
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitri Goutnik
|
@@ -9,28 +9,28 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-06-
|
12
|
+
date: 2009-06-28 00:00:00 -07:00
|
13
13
|
default_executable: repub
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
|
-
name:
|
16
|
+
name: nokogiri
|
17
17
|
type: :runtime
|
18
18
|
version_requirement:
|
19
19
|
version_requirements: !ruby/object:Gem::Requirement
|
20
20
|
requirements:
|
21
21
|
- - ">="
|
22
22
|
- !ruby/object:Gem::Version
|
23
|
-
version:
|
23
|
+
version: 1.3.2
|
24
24
|
version:
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
|
-
name:
|
26
|
+
name: builder
|
27
27
|
type: :runtime
|
28
28
|
version_requirement:
|
29
29
|
version_requirements: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: 2.1.2
|
34
34
|
version:
|
35
35
|
- !ruby/object:Gem::Dependency
|
36
36
|
name: chardet
|
@@ -62,7 +62,7 @@ dependencies:
|
|
62
62
|
- !ruby/object:Gem::Version
|
63
63
|
version: 2.5.1
|
64
64
|
version:
|
65
|
-
description:
|
65
|
+
description: Simple HTML to ePub converter.
|
66
66
|
email: dg@invisiblellama.net
|
67
67
|
executables:
|
68
68
|
- repub
|
@@ -71,15 +71,14 @@ extensions: []
|
|
71
71
|
extra_rdoc_files:
|
72
72
|
- History.txt
|
73
73
|
- README.txt
|
74
|
-
-
|
74
|
+
- SAMPLES.txt
|
75
75
|
- bin/repub
|
76
|
-
- lib/repub/mobi/.githidden
|
77
76
|
files:
|
78
|
-
- .gitignore
|
79
77
|
- History.txt
|
80
78
|
- README.txt
|
81
79
|
- Rakefile
|
82
|
-
-
|
80
|
+
- SAMPLES.txt
|
81
|
+
- TODO
|
83
82
|
- bin/repub
|
84
83
|
- lib/repub.rb
|
85
84
|
- lib/repub/app.rb
|
@@ -94,7 +93,7 @@ files:
|
|
94
93
|
- lib/repub/epub/container.rb
|
95
94
|
- lib/repub/epub/content.rb
|
96
95
|
- lib/repub/epub/toc.rb
|
97
|
-
-
|
96
|
+
- repub.gemspec
|
98
97
|
- test/epub/test_container.rb
|
99
98
|
- test/epub/test_content.rb
|
100
99
|
- test/epub/test_toc.rb
|
@@ -128,7 +127,7 @@ rubyforge_project: repub
|
|
128
127
|
rubygems_version: 1.2.0
|
129
128
|
signing_key:
|
130
129
|
specification_version: 3
|
131
|
-
summary:
|
130
|
+
summary: Simple HTML to ePub converter
|
132
131
|
test_files:
|
133
132
|
- test/epub/test_container.rb
|
134
133
|
- test/epub/test_content.rb
|
data/.gitignore
DELETED
data/lib/repub/mobi/.githidden
DELETED
File without changes
|