invisiblellama-repub 0.3.3 → 0.3.4
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +11 -0
- data/README.rdoc +14 -8
- data/TODO +0 -2
- data/lib/repub.rb +1 -1
- data/lib/repub/app.rb +3 -0
- data/lib/repub/app/builder.rb +151 -154
- data/lib/repub/app/fetcher.rb +10 -23
- data/lib/repub/app/filter.rb +30 -0
- data/lib/repub/app/options.rb +0 -6
- data/lib/repub/app/parser.rb +63 -73
- data/lib/repub/app/post_filters.rb +135 -0
- data/lib/repub/app/pre_filters.rb +50 -0
- data/lib/repub/app/profile.rb +1 -1
- data/lib/repub/epub.rb +4 -3
- data/lib/repub/epub/container_item.rb +49 -0
- data/lib/repub/epub/{toc.rb → ncx.rb} +137 -139
- data/lib/repub/epub/ocf.rb +62 -0
- data/lib/repub/epub/opf.rb +136 -0
- data/repub.gemspec +4 -4
- data/test/epub/{test_toc.rb → test_ncx.rb} +14 -12
- data/test/epub/test_ocf.rb +28 -0
- data/test/epub/{test_content.rb → test_opf.rb} +25 -19
- data/test/test_filter.rb +28 -0
- data/test/test_parser.rb +3 -4
- metadata +17 -11
- data/lib/repub/epub/container.rb +0 -28
- data/lib/repub/epub/content.rb +0 -178
- data/test/epub/test_container.rb +0 -15
data/lib/repub/app/fetcher.rb
CHANGED
@@ -4,7 +4,7 @@ require 'uri'
|
|
4
4
|
require 'iconv'
|
5
5
|
require 'rubygems'
|
6
6
|
|
7
|
-
#
|
7
|
+
# Disable warnings from chardet
|
8
8
|
old_verbose = $VERBOSE
|
9
9
|
$VERBOSE = false
|
10
10
|
require 'UniversalDetector'
|
@@ -17,7 +17,7 @@ module Repub
|
|
17
17
|
class FetcherException < RuntimeError; end
|
18
18
|
|
19
19
|
def fetch
|
20
|
-
|
20
|
+
FetcherSupport.new(options).fetch
|
21
21
|
end
|
22
22
|
|
23
23
|
AssetTypes = {
|
@@ -26,7 +26,7 @@ module Repub
|
|
26
26
|
:images => %w[jpg jpeg png gif svg]
|
27
27
|
}
|
28
28
|
|
29
|
-
class
|
29
|
+
class FetcherSupport
|
30
30
|
include Logger
|
31
31
|
|
32
32
|
Downloaders = {
|
@@ -63,34 +63,21 @@ module Repub
|
|
63
63
|
raise FetcherException, "Fetch failed."
|
64
64
|
end
|
65
65
|
unless cache.cached?
|
66
|
-
|
67
|
-
fix_encoding(cache, @options[:encoding])
|
66
|
+
preprocess cache
|
68
67
|
end
|
69
68
|
end
|
70
69
|
end
|
71
70
|
|
72
71
|
private
|
73
72
|
|
74
|
-
def
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
cache.assets[:documents].each do |doc|
|
80
|
-
unless encoding
|
81
|
-
log.info "Detecting encoding for #{doc}"
|
82
|
-
s = IO.read(doc)
|
83
|
-
raise FetcherException, "empty document" unless s
|
84
|
-
encoding = UniversalDetector.chardet(s)['encoding']
|
85
|
-
end
|
86
|
-
if encoding.downcase != 'utf-8'
|
87
|
-
log.info "Source encoding appears to be #{encoding}, converting to UTF-8"
|
88
|
-
s = Iconv.conv('utf-8', encoding, IO.read(doc))
|
89
|
-
File.open(doc, 'w') { |f| f.write(s) }
|
90
|
-
end
|
73
|
+
def preprocess(cache)
|
74
|
+
cache.assets[:documents].each do |file|
|
75
|
+
log.info "Preprocessing #{file}"
|
76
|
+
s = PreFilters.apply_filters(IO.read(file), @options)
|
77
|
+
File.open(file, 'w') { |f| f.write(s) }
|
91
78
|
end
|
92
79
|
end
|
93
|
-
|
80
|
+
|
94
81
|
def which(cmd)
|
95
82
|
if !RUBY_PLATFORM.match('mswin')
|
96
83
|
cmd = `/usr/bin/which #{cmd}`.strip
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Repub
|
2
|
+
class App
|
3
|
+
module Filter
|
4
|
+
|
5
|
+
def self.included(base)
|
6
|
+
(class << base; self; end).instance_eval do
|
7
|
+
define_method(:filter) do |name, &block|
|
8
|
+
@filters ||= []
|
9
|
+
@filters << {:name => name, :proc => Proc.new(&block) }
|
10
|
+
end
|
11
|
+
attr_reader :filters
|
12
|
+
attr_reader :options
|
13
|
+
end
|
14
|
+
base.extend(ClassMethods)
|
15
|
+
base.extend(Logger)
|
16
|
+
end
|
17
|
+
|
18
|
+
def options
|
19
|
+
self.class.options
|
20
|
+
end
|
21
|
+
|
22
|
+
module ClassMethods
|
23
|
+
def apply_filters(input, options = nil)
|
24
|
+
@options = options
|
25
|
+
@filters.inject(input) { |input, filter| filter[:proc].call(input) }
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
data/lib/repub/app/options.rb
CHANGED
@@ -17,7 +17,6 @@ module Repub
|
|
17
17
|
:browser => false,
|
18
18
|
:css => nil,
|
19
19
|
:encoding => nil,
|
20
|
-
:fixup => true,
|
21
20
|
:helper => 'wget',
|
22
21
|
:metadata => {},
|
23
22
|
:output_path => Dir.getwd,
|
@@ -119,11 +118,6 @@ module Repub
|
|
119
118
|
options[:metadata][name.to_sym] = value
|
120
119
|
end
|
121
120
|
|
122
|
-
opts.on("-F", "--no-fixup",
|
123
|
-
"Do not attempt to make document meet XHTML 1.0 Strict.",
|
124
|
-
"Default is to try and fix things that are broken. "
|
125
|
-
) { |value| options[:fixup] = false }
|
126
|
-
|
127
121
|
opts.on("-e", "--encoding NAME", String,
|
128
122
|
"Set source document encoding. Default is to autodetect."
|
129
123
|
) { |value| options[:encoding] = value }
|
data/lib/repub/app/parser.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'nokogiri'
|
3
|
+
require 'repub/epub'
|
3
4
|
|
4
5
|
module Repub
|
5
6
|
class App
|
@@ -11,7 +12,7 @@ module Repub
|
|
11
12
|
Parser.new(options).parse(cache)
|
12
13
|
end
|
13
14
|
|
14
|
-
# Default selectors
|
15
|
+
# Default selectors, some reasonable values
|
15
16
|
#
|
16
17
|
Selectors = {
|
17
18
|
:title => '//h1',
|
@@ -26,37 +27,36 @@ module Repub
|
|
26
27
|
attr_reader :cache
|
27
28
|
attr_reader :uid
|
28
29
|
attr_reader :title
|
29
|
-
attr_reader :title_html
|
30
30
|
attr_reader :toc
|
31
31
|
|
32
32
|
def initialize(options)
|
33
33
|
@selectors = options[:selectors] || Selectors
|
34
|
-
@fixup = options[:fixup]
|
35
34
|
end
|
36
35
|
|
36
|
+
# Parse downloaded asset cache
|
37
|
+
#
|
37
38
|
def parse(cache)
|
38
39
|
raise ParserException, "No HTML document found" if
|
39
40
|
cache.assets[:documents].empty?
|
41
|
+
# TODO: limited to a single document only
|
40
42
|
raise ParserException, "More than one HTML document found, this is not supported (yet)" if
|
41
43
|
cache.assets[:documents].size > 1
|
42
44
|
|
43
45
|
@cache = cache
|
44
|
-
@asset = @cache.assets[:documents][0]
|
45
|
-
log.debug "-- Parsing #{@asset}"
|
46
|
-
@doc = Nokogiri::HTML.parse(IO.read(File.join(@cache.path, @
|
46
|
+
@document = @cache.assets[:documents][0]
|
47
|
+
log.debug "-- Parsing #{@document}"
|
48
|
+
@doc = Nokogiri::HTML.parse(IO.read(File.join(@cache.path, @document)), nil, 'UTF-8')
|
47
49
|
|
48
50
|
@uid = @cache.name
|
49
51
|
parse_title
|
50
|
-
parse_title_html
|
51
52
|
parse_toc
|
52
|
-
|
53
53
|
self
|
54
54
|
end
|
55
55
|
|
56
56
|
private
|
57
|
-
|
58
|
-
|
59
|
-
|
57
|
+
|
58
|
+
# Parse document title
|
59
|
+
#
|
60
60
|
def parse_title
|
61
61
|
log.debug "-- Looking for title with #{@selectors[:title]}"
|
62
62
|
el = @doc.at(@selectors[:title])
|
@@ -69,82 +69,72 @@ module Repub
|
|
69
69
|
@title = title_text.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
|
70
70
|
log.info "Found title \"#{@title}\""
|
71
71
|
else
|
72
|
-
@title = UNTITLED
|
72
|
+
@title = 'Untitled'
|
73
73
|
log.warn "** Could not find document title, using '#{@title}'"
|
74
74
|
end
|
75
75
|
end
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
el = @doc.at(@selectors[:title])
|
80
|
-
@title_html = el ? el.inner_html.gsub(/[\r\n]/, '') : UNTITLED
|
81
|
-
end
|
82
|
-
|
83
|
-
# Helper container for TOC items
|
76
|
+
|
77
|
+
# Parsed TOC item container
|
78
|
+
# Inherit from NavPoint to avoid conversions later in Builder
|
84
79
|
#
|
85
|
-
class TocItem <
|
86
|
-
:title,
|
87
|
-
:uri,
|
88
|
-
:fragment_id
|
89
|
-
)
|
80
|
+
class TocItem < Repub::Epub::NCX::NavPoint
|
90
81
|
|
91
|
-
def initialize(title, uri_with_fragment_id, subitems,
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
@subitems = subitems || []
|
82
|
+
def initialize(title, uri_with_fragment_id, subitems, document)
|
83
|
+
uri, fragment_id = uri_with_fragment_id.split(/#/)
|
84
|
+
uri = document if uri.empty?
|
85
|
+
super(title, "#{uri}##{fragment_id}", subitems)
|
96
86
|
end
|
97
|
-
|
98
|
-
attr_reader :subitems
|
99
|
-
|
100
|
-
def src
|
101
|
-
"#{uri}##{fragment_id}"
|
102
|
-
end
|
103
|
-
end
|
104
87
|
|
88
|
+
end
|
89
|
+
|
90
|
+
# Look for TOC and recursively parse it
|
91
|
+
#
|
105
92
|
def parse_toc
|
93
|
+
@toc = []
|
94
|
+
depth = 0
|
95
|
+
|
96
|
+
l = lambda do |section|
|
97
|
+
toc_items = []
|
98
|
+
depth += 1
|
99
|
+
section.xpath(@selectors[:toc_item]).each do |item|
|
100
|
+
# Get item's anchor and href
|
101
|
+
a = item.name == 'a' ? item : item.at('a')
|
102
|
+
next if !a
|
103
|
+
href = a['href']
|
104
|
+
next if !href
|
105
|
+
|
106
|
+
# Is this a leaf item or node? Title parsing depends on that.
|
107
|
+
subsection = item.xpath(@selectors[:toc_section]).first
|
108
|
+
if subsection
|
109
|
+
# Item has subsection, use anchor text for title
|
110
|
+
title = a.inner_text
|
111
|
+
else
|
112
|
+
# Leaf item, it is safe to glue inner_text from all children
|
113
|
+
title = item.children.map{|c| c.inner_text }.join(' ')
|
114
|
+
end
|
115
|
+
title = title.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
|
116
|
+
log.debug "-- #{" " * depth}#{title}"
|
117
|
+
|
118
|
+
# Parse subsection
|
119
|
+
subitems = l.call(subsection) if subsection
|
120
|
+
|
121
|
+
toc_items << TocItem.new(title, href, subitems, @document)
|
122
|
+
end
|
123
|
+
depth -= 1
|
124
|
+
toc_items
|
125
|
+
end
|
126
|
+
|
106
127
|
log.debug "-- Looking for TOC with #{@selectors[:toc]}"
|
107
|
-
|
108
|
-
|
109
|
-
|
128
|
+
toc_element = @doc.xpath(@selectors[:toc]).first
|
129
|
+
|
130
|
+
if toc_element
|
131
|
+
log.debug "-- Found TOC, parsing items with #{@selectors[:toc_item]} and sections with #{@selectors[:toc_section]}"
|
132
|
+
@toc = l.call(toc_element)
|
110
133
|
log.info "Found TOC with #{@toc.size} top-level items"
|
111
134
|
else
|
112
|
-
@toc = []
|
113
135
|
log.warn "** Could not find document table of contents"
|
114
136
|
end
|
115
137
|
end
|
116
|
-
|
117
|
-
def parse_toc_section(section)
|
118
|
-
toc = []
|
119
|
-
log.debug "-- Looking for TOC items with #{@selectors[:toc_item]}"
|
120
|
-
section.xpath(@selectors[:toc_item]).each do |item|
|
121
|
-
# Get item's anchor and href
|
122
|
-
a = item.name == 'a' ? item : item.at('a')
|
123
|
-
next if !a
|
124
|
-
href = a['href']
|
125
|
-
next if !href
|
126
|
-
# Is this a leaf item or node ?
|
127
|
-
subsection = item.xpath(@selectors[:toc_section]).first
|
128
|
-
if subsection
|
129
|
-
# Item has subsection, use anchor text for title
|
130
|
-
title = a.inner_text
|
131
|
-
else
|
132
|
-
# Leaf item, glue inner_text from all children
|
133
|
-
title = item.children.map{|c| c.inner_text }.join(' ')
|
134
|
-
end
|
135
|
-
title = title.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
|
136
|
-
log.debug "-- Found item: #{title}"
|
137
|
-
# Parse sub-section
|
138
|
-
if subsection
|
139
|
-
log.debug "-- Found section with #{@selectors[:toc_section]}"
|
140
|
-
log.debug "-- >"
|
141
|
-
subitems = parse_toc_section(subsection)
|
142
|
-
log.debug '-- .'
|
143
|
-
end
|
144
|
-
toc << TocItem.new(title, href, subitems, @asset)
|
145
|
-
end
|
146
|
-
toc
|
147
|
-
end
|
148
138
|
end
|
149
139
|
|
150
140
|
end
|
@@ -0,0 +1,135 @@
|
|
1
|
+
require 'repub/app/filter'
|
2
|
+
|
3
|
+
module Repub
|
4
|
+
class App
|
5
|
+
class PostFilters
|
6
|
+
|
7
|
+
class FileFilters
|
8
|
+
include Filter
|
9
|
+
|
10
|
+
# Do rx substitutions
|
11
|
+
#
|
12
|
+
filter :do_rxes do |s|
|
13
|
+
options[:rx].each do |rx|
|
14
|
+
rx.strip!
|
15
|
+
delimiter = rx[0, 1]
|
16
|
+
rx = rx.gsub(/\\#{delimiter}/, "\n")
|
17
|
+
ra = rx.split(/#{delimiter}/).reject {|e| e.empty? }.each {|e| e.gsub!(/\n/, "#{delimiter}")}
|
18
|
+
raise ParserException, "Invalid regular expression" if ra.empty? || ra[0].nil? || ra.size > 2
|
19
|
+
pattern = ra[0]
|
20
|
+
replacement = ra[1] || ''
|
21
|
+
log.info "Replacing pattern /#{pattern.gsub(/#{delimiter}/, "\\#{delimiter}")}/ with \"#{replacement}\""
|
22
|
+
s.gsub!(Regexp.new(pattern), replacement)
|
23
|
+
end if options[:rx]
|
24
|
+
s
|
25
|
+
end
|
26
|
+
|
27
|
+
# Remove xml preamble if any
|
28
|
+
#
|
29
|
+
filter :fix_xml_preamble do |s|
|
30
|
+
preamble_rx = /^\s*<\?xml\s+[^>]+>\s*/mi
|
31
|
+
if s =~ preamble_rx
|
32
|
+
log.debug "-- Removing xml preamble"
|
33
|
+
s.sub!(preamble_rx, '')
|
34
|
+
end
|
35
|
+
s
|
36
|
+
end
|
37
|
+
|
38
|
+
# Replace doctype
|
39
|
+
#
|
40
|
+
filter :fix_doctype do |s|
|
41
|
+
doctype_rx = /^\s*<!DOCTYPE\s+[^>]+>\s*/mi
|
42
|
+
if s =~ doctype_rx
|
43
|
+
s.sub!(doctype_rx, '')
|
44
|
+
end
|
45
|
+
log.debug "-- Replacing doctype"
|
46
|
+
s = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + s
|
47
|
+
s
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
class DocumentFilters
|
52
|
+
include Filter
|
53
|
+
|
54
|
+
# Set Content-Type charset to UTF-8
|
55
|
+
#
|
56
|
+
filter :fix_content_type do |doc|
|
57
|
+
doc.xpath('//head/meta[@http-equiv="Content-Type"]').each do |el|
|
58
|
+
el['content'] = 'text/html; charset=utf-8'
|
59
|
+
end
|
60
|
+
doc
|
61
|
+
end
|
62
|
+
|
63
|
+
# Process styles
|
64
|
+
#
|
65
|
+
filter :fix_styles do |doc|
|
66
|
+
if options[:css] && !options[:css].empty?
|
67
|
+
# Remove all stylesheet links
|
68
|
+
doc.xpath('//head/link[@rel="stylesheet"]').remove
|
69
|
+
if options[:css] == '-'
|
70
|
+
# Also remove all inline styles
|
71
|
+
doc.xpath('//head/style').remove
|
72
|
+
log.info "Removing all stylesheet links and style elements"
|
73
|
+
else
|
74
|
+
# Add custom stylesheet link
|
75
|
+
link = Nokogiri::XML::Node.new('link', doc)
|
76
|
+
link['rel'] = 'stylesheet'
|
77
|
+
link['type'] = 'text/css'
|
78
|
+
link['href'] = File.basename(@options[:css])
|
79
|
+
# Add as the last child so it has precedence over (possible) inline styles before
|
80
|
+
doc.at('//head').add_child(link)
|
81
|
+
log.info "Replacing CSS refs with \"#{link['href']}\""
|
82
|
+
end
|
83
|
+
end
|
84
|
+
doc
|
85
|
+
end
|
86
|
+
|
87
|
+
# Insert elements after/before selector
|
88
|
+
#
|
89
|
+
filter :do_inserts do |doc|
|
90
|
+
options[:after].each do |e|
|
91
|
+
selector = e.keys.first
|
92
|
+
fragment = e[selector]
|
93
|
+
element = doc.xpath(selector).first
|
94
|
+
if element
|
95
|
+
log.info "Inserting fragment \"#{fragment.to_html}\" after \"#{selector}\""
|
96
|
+
fragment.children.to_a.reverse.each {|node| element.add_next_sibling(node) }
|
97
|
+
end
|
98
|
+
end if options[:after]
|
99
|
+
options[:before].each do |e|
|
100
|
+
selector = e.keys.first
|
101
|
+
fragment = e[selector]
|
102
|
+
element = doc.xpath(selector).first
|
103
|
+
if element
|
104
|
+
log.info "Inserting fragment \"#{fragment}\" before \"#{selector}\""
|
105
|
+
fragment.children.to_a.each {|node| element.add_previous_sibling(node) }
|
106
|
+
end
|
107
|
+
end if options[:before]
|
108
|
+
doc
|
109
|
+
end
|
110
|
+
|
111
|
+
# Remove elements
|
112
|
+
#
|
113
|
+
filter :do_removes do |doc|
|
114
|
+
options[:remove].each do |selector|
|
115
|
+
log.info "Removing elements \"#{selector}\""
|
116
|
+
doc.search(selector).remove
|
117
|
+
end if options[:remove]
|
118
|
+
doc
|
119
|
+
end
|
120
|
+
|
121
|
+
# TODO: XHTML requires a to have embedding element
|
122
|
+
# filter :wrap_anchors do |doc|
|
123
|
+
# log.info "Wrapping anchors"
|
124
|
+
# doc.xpath('//body/a').each do |a|
|
125
|
+
# wrapper = Nokogiri::XML::Node.new('p', doc)
|
126
|
+
# a.add_next_sibling(wrapper)
|
127
|
+
# wrapper << a
|
128
|
+
# end
|
129
|
+
# doc
|
130
|
+
# end
|
131
|
+
end
|
132
|
+
|
133
|
+
end
|
134
|
+
end
|
135
|
+
end
|