repub 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +11 -0
- data/README.rdoc +14 -8
- data/TODO +0 -2
- data/lib/repub.rb +1 -1
- data/lib/repub/app.rb +3 -0
- data/lib/repub/app/builder.rb +151 -154
- data/lib/repub/app/fetcher.rb +10 -23
- data/lib/repub/app/filter.rb +30 -0
- data/lib/repub/app/options.rb +0 -6
- data/lib/repub/app/parser.rb +63 -73
- data/lib/repub/app/post_filters.rb +135 -0
- data/lib/repub/app/pre_filters.rb +50 -0
- data/lib/repub/app/profile.rb +1 -1
- data/lib/repub/epub.rb +4 -3
- data/lib/repub/epub/container_item.rb +49 -0
- data/lib/repub/epub/{toc.rb → ncx.rb} +137 -139
- data/lib/repub/epub/ocf.rb +62 -0
- data/lib/repub/epub/opf.rb +136 -0
- data/repub.gemspec +4 -4
- data/test/epub/{test_toc.rb → test_ncx.rb} +14 -12
- data/test/epub/test_ocf.rb +28 -0
- data/test/epub/{test_content.rb → test_opf.rb} +25 -19
- data/test/test_filter.rb +28 -0
- data/test/test_parser.rb +3 -4
- metadata +17 -11
- data/lib/repub/epub/container.rb +0 -28
- data/lib/repub/epub/content.rb +0 -178
- data/test/epub/test_container.rb +0 -15
data/lib/repub/app/fetcher.rb
CHANGED
@@ -4,7 +4,7 @@ require 'uri'
|
|
4
4
|
require 'iconv'
|
5
5
|
require 'rubygems'
|
6
6
|
|
7
|
-
#
|
7
|
+
# Disable warnings from chardet
|
8
8
|
old_verbose = $VERBOSE
|
9
9
|
$VERBOSE = false
|
10
10
|
require 'UniversalDetector'
|
@@ -17,7 +17,7 @@ module Repub
|
|
17
17
|
class FetcherException < RuntimeError; end
|
18
18
|
|
19
19
|
def fetch
|
20
|
-
|
20
|
+
FetcherSupport.new(options).fetch
|
21
21
|
end
|
22
22
|
|
23
23
|
AssetTypes = {
|
@@ -26,7 +26,7 @@ module Repub
|
|
26
26
|
:images => %w[jpg jpeg png gif svg]
|
27
27
|
}
|
28
28
|
|
29
|
-
class
|
29
|
+
class FetcherSupport
|
30
30
|
include Logger
|
31
31
|
|
32
32
|
Downloaders = {
|
@@ -63,34 +63,21 @@ module Repub
|
|
63
63
|
raise FetcherException, "Fetch failed."
|
64
64
|
end
|
65
65
|
unless cache.cached?
|
66
|
-
|
67
|
-
fix_encoding(cache, @options[:encoding])
|
66
|
+
preprocess cache
|
68
67
|
end
|
69
68
|
end
|
70
69
|
end
|
71
70
|
|
72
71
|
private
|
73
72
|
|
74
|
-
def
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
cache.assets[:documents].each do |doc|
|
80
|
-
unless encoding
|
81
|
-
log.info "Detecting encoding for #{doc}"
|
82
|
-
s = IO.read(doc)
|
83
|
-
raise FetcherException, "empty document" unless s
|
84
|
-
encoding = UniversalDetector.chardet(s)['encoding']
|
85
|
-
end
|
86
|
-
if encoding.downcase != 'utf-8'
|
87
|
-
log.info "Source encoding appears to be #{encoding}, converting to UTF-8"
|
88
|
-
s = Iconv.conv('utf-8', encoding, IO.read(doc))
|
89
|
-
File.open(doc, 'w') { |f| f.write(s) }
|
90
|
-
end
|
73
|
+
def preprocess(cache)
|
74
|
+
cache.assets[:documents].each do |file|
|
75
|
+
log.info "Preprocessing #{file}"
|
76
|
+
s = PreFilters.apply_filters(IO.read(file), @options)
|
77
|
+
File.open(file, 'w') { |f| f.write(s) }
|
91
78
|
end
|
92
79
|
end
|
93
|
-
|
80
|
+
|
94
81
|
def which(cmd)
|
95
82
|
if !RUBY_PLATFORM.match('mswin')
|
96
83
|
cmd = `/usr/bin/which #{cmd}`.strip
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Repub
|
2
|
+
class App
|
3
|
+
module Filter
|
4
|
+
|
5
|
+
def self.included(base)
|
6
|
+
(class << base; self; end).instance_eval do
|
7
|
+
define_method(:filter) do |name, &block|
|
8
|
+
@filters ||= []
|
9
|
+
@filters << {:name => name, :proc => Proc.new(&block) }
|
10
|
+
end
|
11
|
+
attr_reader :filters
|
12
|
+
attr_reader :options
|
13
|
+
end
|
14
|
+
base.extend(ClassMethods)
|
15
|
+
base.extend(Logger)
|
16
|
+
end
|
17
|
+
|
18
|
+
def options
|
19
|
+
self.class.options
|
20
|
+
end
|
21
|
+
|
22
|
+
module ClassMethods
|
23
|
+
def apply_filters(input, options = nil)
|
24
|
+
@options = options
|
25
|
+
@filters.inject(input) { |input, filter| filter[:proc].call(input) }
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
data/lib/repub/app/options.rb
CHANGED
@@ -17,7 +17,6 @@ module Repub
|
|
17
17
|
:browser => false,
|
18
18
|
:css => nil,
|
19
19
|
:encoding => nil,
|
20
|
-
:fixup => true,
|
21
20
|
:helper => 'wget',
|
22
21
|
:metadata => {},
|
23
22
|
:output_path => Dir.getwd,
|
@@ -119,11 +118,6 @@ module Repub
|
|
119
118
|
options[:metadata][name.to_sym] = value
|
120
119
|
end
|
121
120
|
|
122
|
-
opts.on("-F", "--no-fixup",
|
123
|
-
"Do not attempt to make document meet XHTML 1.0 Strict.",
|
124
|
-
"Default is to try and fix things that are broken. "
|
125
|
-
) { |value| options[:fixup] = false }
|
126
|
-
|
127
121
|
opts.on("-e", "--encoding NAME", String,
|
128
122
|
"Set source document encoding. Default is to autodetect."
|
129
123
|
) { |value| options[:encoding] = value }
|
data/lib/repub/app/parser.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'nokogiri'
|
3
|
+
require 'repub/epub'
|
3
4
|
|
4
5
|
module Repub
|
5
6
|
class App
|
@@ -11,7 +12,7 @@ module Repub
|
|
11
12
|
Parser.new(options).parse(cache)
|
12
13
|
end
|
13
14
|
|
14
|
-
# Default selectors
|
15
|
+
# Default selectors, some reasonable values
|
15
16
|
#
|
16
17
|
Selectors = {
|
17
18
|
:title => '//h1',
|
@@ -26,37 +27,36 @@ module Repub
|
|
26
27
|
attr_reader :cache
|
27
28
|
attr_reader :uid
|
28
29
|
attr_reader :title
|
29
|
-
attr_reader :title_html
|
30
30
|
attr_reader :toc
|
31
31
|
|
32
32
|
def initialize(options)
|
33
33
|
@selectors = options[:selectors] || Selectors
|
34
|
-
@fixup = options[:fixup]
|
35
34
|
end
|
36
35
|
|
36
|
+
# Parse downloaded asset cache
|
37
|
+
#
|
37
38
|
def parse(cache)
|
38
39
|
raise ParserException, "No HTML document found" if
|
39
40
|
cache.assets[:documents].empty?
|
41
|
+
# TODO: limited to a single document only
|
40
42
|
raise ParserException, "More than one HTML document found, this is not supported (yet)" if
|
41
43
|
cache.assets[:documents].size > 1
|
42
44
|
|
43
45
|
@cache = cache
|
44
|
-
@
|
45
|
-
log.debug "-- Parsing #{@
|
46
|
-
@doc = Nokogiri::HTML.parse(IO.read(File.join(@cache.path, @
|
46
|
+
@document = @cache.assets[:documents][0]
|
47
|
+
log.debug "-- Parsing #{@document}"
|
48
|
+
@doc = Nokogiri::HTML.parse(IO.read(File.join(@cache.path, @document)), nil, 'UTF-8')
|
47
49
|
|
48
50
|
@uid = @cache.name
|
49
51
|
parse_title
|
50
|
-
parse_title_html
|
51
52
|
parse_toc
|
52
|
-
|
53
53
|
self
|
54
54
|
end
|
55
55
|
|
56
56
|
private
|
57
|
-
|
58
|
-
|
59
|
-
|
57
|
+
|
58
|
+
# Parse document title
|
59
|
+
#
|
60
60
|
def parse_title
|
61
61
|
log.debug "-- Looking for title with #{@selectors[:title]}"
|
62
62
|
el = @doc.at(@selectors[:title])
|
@@ -69,82 +69,72 @@ module Repub
|
|
69
69
|
@title = title_text.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
|
70
70
|
log.info "Found title \"#{@title}\""
|
71
71
|
else
|
72
|
-
@title =
|
72
|
+
@title = 'Untitled'
|
73
73
|
log.warn "** Could not find document title, using '#{@title}'"
|
74
74
|
end
|
75
75
|
end
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
el = @doc.at(@selectors[:title])
|
80
|
-
@title_html = el ? el.inner_html.gsub(/[\r\n]/, '') : UNTITLED
|
81
|
-
end
|
82
|
-
|
83
|
-
# Helper container for TOC items
|
76
|
+
|
77
|
+
# Parsed TOC item container
|
78
|
+
# Inherit from NavPoint to avoid conversions later in Builder
|
84
79
|
#
|
85
|
-
class TocItem <
|
86
|
-
:title,
|
87
|
-
:uri,
|
88
|
-
:fragment_id
|
89
|
-
)
|
80
|
+
class TocItem < Repub::Epub::NCX::NavPoint
|
90
81
|
|
91
|
-
def initialize(title, uri_with_fragment_id, subitems,
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
@subitems = subitems || []
|
82
|
+
def initialize(title, uri_with_fragment_id, subitems, document)
|
83
|
+
uri, fragment_id = uri_with_fragment_id.split(/#/)
|
84
|
+
uri = document if uri.empty?
|
85
|
+
super(title, "#{uri}##{fragment_id}", subitems)
|
96
86
|
end
|
97
|
-
|
98
|
-
attr_reader :subitems
|
99
|
-
|
100
|
-
def src
|
101
|
-
"#{uri}##{fragment_id}"
|
102
|
-
end
|
103
|
-
end
|
104
87
|
|
88
|
+
end
|
89
|
+
|
90
|
+
# Look for TOC and recursively parse it
|
91
|
+
#
|
105
92
|
def parse_toc
|
93
|
+
@toc = []
|
94
|
+
depth = 0
|
95
|
+
|
96
|
+
l = lambda do |section|
|
97
|
+
toc_items = []
|
98
|
+
depth += 1
|
99
|
+
section.xpath(@selectors[:toc_item]).each do |item|
|
100
|
+
# Get item's anchor and href
|
101
|
+
a = item.name == 'a' ? item : item.at('a')
|
102
|
+
next if !a
|
103
|
+
href = a['href']
|
104
|
+
next if !href
|
105
|
+
|
106
|
+
# Is this a leaf item or node? Title parsing depends on that.
|
107
|
+
subsection = item.xpath(@selectors[:toc_section]).first
|
108
|
+
if subsection
|
109
|
+
# Item has subsection, use anchor text for title
|
110
|
+
title = a.inner_text
|
111
|
+
else
|
112
|
+
# Leaf item, it is safe to glue inner_text from all children
|
113
|
+
title = item.children.map{|c| c.inner_text }.join(' ')
|
114
|
+
end
|
115
|
+
title = title.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
|
116
|
+
log.debug "-- #{" " * depth}#{title}"
|
117
|
+
|
118
|
+
# Parse subsection
|
119
|
+
subitems = l.call(subsection) if subsection
|
120
|
+
|
121
|
+
toc_items << TocItem.new(title, href, subitems, @document)
|
122
|
+
end
|
123
|
+
depth -= 1
|
124
|
+
toc_items
|
125
|
+
end
|
126
|
+
|
106
127
|
log.debug "-- Looking for TOC with #{@selectors[:toc]}"
|
107
|
-
|
108
|
-
|
109
|
-
|
128
|
+
toc_element = @doc.xpath(@selectors[:toc]).first
|
129
|
+
|
130
|
+
if toc_element
|
131
|
+
log.debug "-- Found TOC, parsing items with #{@selectors[:toc_item]} and sections with #{@selectors[:toc_section]}"
|
132
|
+
@toc = l.call(toc_element)
|
110
133
|
log.info "Found TOC with #{@toc.size} top-level items"
|
111
134
|
else
|
112
|
-
@toc = []
|
113
135
|
log.warn "** Could not find document table of contents"
|
114
136
|
end
|
115
137
|
end
|
116
|
-
|
117
|
-
def parse_toc_section(section)
|
118
|
-
toc = []
|
119
|
-
log.debug "-- Looking for TOC items with #{@selectors[:toc_item]}"
|
120
|
-
section.xpath(@selectors[:toc_item]).each do |item|
|
121
|
-
# Get item's anchor and href
|
122
|
-
a = item.name == 'a' ? item : item.at('a')
|
123
|
-
next if !a
|
124
|
-
href = a['href']
|
125
|
-
next if !href
|
126
|
-
# Is this a leaf item or node ?
|
127
|
-
subsection = item.xpath(@selectors[:toc_section]).first
|
128
|
-
if subsection
|
129
|
-
# Item has subsection, use anchor text for title
|
130
|
-
title = a.inner_text
|
131
|
-
else
|
132
|
-
# Leaf item, glue inner_text from all children
|
133
|
-
title = item.children.map{|c| c.inner_text }.join(' ')
|
134
|
-
end
|
135
|
-
title = title.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
|
136
|
-
log.debug "-- Found item: #{title}"
|
137
|
-
# Parse sub-section
|
138
|
-
if subsection
|
139
|
-
log.debug "-- Found section with #{@selectors[:toc_section]}"
|
140
|
-
log.debug "-- >"
|
141
|
-
subitems = parse_toc_section(subsection)
|
142
|
-
log.debug '-- .'
|
143
|
-
end
|
144
|
-
toc << TocItem.new(title, href, subitems, @asset)
|
145
|
-
end
|
146
|
-
toc
|
147
|
-
end
|
148
138
|
end
|
149
139
|
|
150
140
|
end
|
@@ -0,0 +1,135 @@
|
|
1
|
+
require 'repub/app/filter'
|
2
|
+
|
3
|
+
module Repub
|
4
|
+
class App
|
5
|
+
class PostFilters
|
6
|
+
|
7
|
+
class FileFilters
|
8
|
+
include Filter
|
9
|
+
|
10
|
+
# Do rx substitutions
|
11
|
+
#
|
12
|
+
filter :do_rxes do |s|
|
13
|
+
options[:rx].each do |rx|
|
14
|
+
rx.strip!
|
15
|
+
delimiter = rx[0, 1]
|
16
|
+
rx = rx.gsub(/\\#{delimiter}/, "\n")
|
17
|
+
ra = rx.split(/#{delimiter}/).reject {|e| e.empty? }.each {|e| e.gsub!(/\n/, "#{delimiter}")}
|
18
|
+
raise ParserException, "Invalid regular expression" if ra.empty? || ra[0].nil? || ra.size > 2
|
19
|
+
pattern = ra[0]
|
20
|
+
replacement = ra[1] || ''
|
21
|
+
log.info "Replacing pattern /#{pattern.gsub(/#{delimiter}/, "\\#{delimiter}")}/ with \"#{replacement}\""
|
22
|
+
s.gsub!(Regexp.new(pattern), replacement)
|
23
|
+
end if options[:rx]
|
24
|
+
s
|
25
|
+
end
|
26
|
+
|
27
|
+
# Remove xml preamble if any
|
28
|
+
#
|
29
|
+
filter :fix_xml_preamble do |s|
|
30
|
+
preamble_rx = /^\s*<\?xml\s+[^>]+>\s*/mi
|
31
|
+
if s =~ preamble_rx
|
32
|
+
log.debug "-- Removing xml preamble"
|
33
|
+
s.sub!(preamble_rx, '')
|
34
|
+
end
|
35
|
+
s
|
36
|
+
end
|
37
|
+
|
38
|
+
# Replace doctype
|
39
|
+
#
|
40
|
+
filter :fix_doctype do |s|
|
41
|
+
doctype_rx = /^\s*<!DOCTYPE\s+[^>]+>\s*/mi
|
42
|
+
if s =~ doctype_rx
|
43
|
+
s.sub!(doctype_rx, '')
|
44
|
+
end
|
45
|
+
log.debug "-- Replacing doctype"
|
46
|
+
s = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + s
|
47
|
+
s
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
class DocumentFilters
|
52
|
+
include Filter
|
53
|
+
|
54
|
+
# Set Content-Type charset to UTF-8
|
55
|
+
#
|
56
|
+
filter :fix_content_type do |doc|
|
57
|
+
doc.xpath('//head/meta[@http-equiv="Content-Type"]').each do |el|
|
58
|
+
el['content'] = 'text/html; charset=utf-8'
|
59
|
+
end
|
60
|
+
doc
|
61
|
+
end
|
62
|
+
|
63
|
+
# Process styles
|
64
|
+
#
|
65
|
+
filter :fix_styles do |doc|
|
66
|
+
if options[:css] && !options[:css].empty?
|
67
|
+
# Remove all stylesheet links
|
68
|
+
doc.xpath('//head/link[@rel="stylesheet"]').remove
|
69
|
+
if options[:css] == '-'
|
70
|
+
# Also remove all inline styles
|
71
|
+
doc.xpath('//head/style').remove
|
72
|
+
log.info "Removing all stylesheet links and style elements"
|
73
|
+
else
|
74
|
+
# Add custom stylesheet link
|
75
|
+
link = Nokogiri::XML::Node.new('link', doc)
|
76
|
+
link['rel'] = 'stylesheet'
|
77
|
+
link['type'] = 'text/css'
|
78
|
+
link['href'] = File.basename(@options[:css])
|
79
|
+
# Add as the last child so it takes precedence over any inline styles that precede it
|
80
|
+
doc.at('//head').add_child(link)
|
81
|
+
log.info "Replacing CSS refs with \"#{link['href']}\""
|
82
|
+
end
|
83
|
+
end
|
84
|
+
doc
|
85
|
+
end
|
86
|
+
|
87
|
+
# Insert elements after/before selector
|
88
|
+
#
|
89
|
+
filter :do_inserts do |doc|
|
90
|
+
options[:after].each do |e|
|
91
|
+
selector = e.keys.first
|
92
|
+
fragment = e[selector]
|
93
|
+
element = doc.xpath(selector).first
|
94
|
+
if element
|
95
|
+
log.info "Inserting fragment \"#{fragment.to_html}\" after \"#{selector}\""
|
96
|
+
fragment.children.to_a.reverse.each {|node| element.add_next_sibling(node) }
|
97
|
+
end
|
98
|
+
end if options[:after]
|
99
|
+
options[:before].each do |e|
|
100
|
+
selector = e.keys.first
|
101
|
+
fragment = e[selector]
|
102
|
+
element = doc.xpath(selector).first
|
103
|
+
if element
|
104
|
+
log.info "Inserting fragment \"#{fragment}\" before \"#{selector}\""
|
105
|
+
fragment.children.to_a.each {|node| element.add_previous_sibling(node) }
|
106
|
+
end
|
107
|
+
end if options[:before]
|
108
|
+
doc
|
109
|
+
end
|
110
|
+
|
111
|
+
# Remove elements
|
112
|
+
#
|
113
|
+
filter :do_removes do |doc|
|
114
|
+
options[:remove].each do |selector|
|
115
|
+
log.info "Removing elements \"#{selector}\""
|
116
|
+
doc.search(selector).remove
|
117
|
+
end if options[:remove]
|
118
|
+
doc
|
119
|
+
end
|
120
|
+
|
121
|
+
# TODO: XHTML requires <a> to have an enclosing element
|
122
|
+
# filter :wrap_anchors do |doc|
|
123
|
+
# log.info "Wrapping anchors"
|
124
|
+
# doc.xpath('//body/a').each do |a|
|
125
|
+
# wrapper = Nokogiri::XML::Node.new('p', doc)
|
126
|
+
# a.add_next_sibling(wrapper)
|
127
|
+
# wrapper << a
|
128
|
+
# end
|
129
|
+
# doc
|
130
|
+
# end
|
131
|
+
end
|
132
|
+
|
133
|
+
end
|
134
|
+
end
|
135
|
+
end
|