invisiblellama-repub 0.3.3 → 0.3.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -4,7 +4,7 @@ require 'uri'
4
4
  require 'iconv'
5
5
  require 'rubygems'
6
6
 
7
- # Temporary disable warnings from chardet
7
+ # Disable warnings from chardet
8
8
  old_verbose = $VERBOSE
9
9
  $VERBOSE = false
10
10
  require 'UniversalDetector'
@@ -17,7 +17,7 @@ module Repub
17
17
  class FetcherException < RuntimeError; end
18
18
 
19
19
  def fetch
20
- Fetcher.new(options).fetch
20
+ FetcherSupport.new(options).fetch
21
21
  end
22
22
 
23
23
  AssetTypes = {
@@ -26,7 +26,7 @@ module Repub
26
26
  :images => %w[jpg jpeg png gif svg]
27
27
  }
28
28
 
29
- class Fetcher
29
+ class FetcherSupport
30
30
  include Logger
31
31
 
32
32
  Downloaders = {
@@ -63,34 +63,21 @@ module Repub
63
63
  raise FetcherException, "Fetch failed."
64
64
  end
65
65
  unless cache.cached?
66
- fix_filenames(cache)
67
- fix_encoding(cache, @options[:encoding])
66
+ preprocess cache
68
67
  end
69
68
  end
70
69
  end
71
70
 
72
71
  private
73
72
 
74
- def fix_filenames(cache)
75
- # TODO: fix non-alphanum characters in doc filenames
76
- end
77
-
78
- def fix_encoding(cache, encoding = nil)
79
- cache.assets[:documents].each do |doc|
80
- unless encoding
81
- log.info "Detecting encoding for #{doc}"
82
- s = IO.read(doc)
83
- raise FetcherException, "empty document" unless s
84
- encoding = UniversalDetector.chardet(s)['encoding']
85
- end
86
- if encoding.downcase != 'utf-8'
87
- log.info "Source encoding appears to be #{encoding}, converting to UTF-8"
88
- s = Iconv.conv('utf-8', encoding, IO.read(doc))
89
- File.open(doc, 'w') { |f| f.write(s) }
90
- end
73
+ def preprocess(cache)
74
+ cache.assets[:documents].each do |file|
75
+ log.info "Preprocessing #{file}"
76
+ s = PreFilters.apply_filters(IO.read(file), @options)
77
+ File.open(file, 'w') { |f| f.write(s) }
91
78
  end
92
79
  end
93
-
80
+
94
81
  def which(cmd)
95
82
  if !RUBY_PLATFORM.match('mswin')
96
83
  cmd = `/usr/bin/which #{cmd}`.strip
@@ -0,0 +1,30 @@
1
module Repub
  class App
    # Mixin that gives the including class a small class-level filter chain.
    #
    # Filters are registered with +filter+ and run, in registration order,
    # by +apply_filters+; each filter receives the value returned by the
    # previous one:
    #
    #   class Cleanup
    #     include Filter
    #     filter(:strip) { |s| s.strip }
    #   end
    #   Cleanup.apply_filters("  text ")  # => "text"
    #
    # Filter blocks are stored as given and called with the filtered value
    # as their only argument, so +self+ inside a block is whatever it was
    # where the block was written (the class, when written in a class body).
    #
    module Filter

      # Hook run when the module is included: adds the class-level DSL from
      # ClassMethods and extends the including class with Logger.
      #
      def self.included(base)
        base.extend(ClassMethods)
        base.extend(Logger)
      end

      # Options passed to the most recent +apply_filters+ call on the class
      #
      def options
        self.class.options
      end

      module ClassMethods
        # Options given to the most recent +apply_filters+ call (nil before
        # the first call). Stored on the class, so it is shared by every
        # user of that class.
        attr_reader :options

        # Registered filters, in registration order, as an array of
        # {:name => ..., :proc => ...} hashes. Empty when nothing has
        # been registered yet.
        #
        def filters
          @filters ||= []
        end

        # Register a named filter. The block receives the current value
        # and must return the (possibly modified) value.
        #
        # Raises ArgumentError when no block is given.
        #
        def filter(name, &block)
          raise ArgumentError, "filter #{name.inspect} requires a block" unless block
          filters << { :name => name, :proc => block }
        end

        # Run +input+ through every registered filter and return the result.
        # +options+ is remembered on the class so filter blocks can read it.
        # With no registered filters the input is returned unchanged.
        #
        def apply_filters(input, options = nil)
          @options = options
          filters.inject(input) { |value, entry| entry[:proc].call(value) }
        end
      end
    end
  end
end
@@ -17,7 +17,6 @@ module Repub
17
17
  :browser => false,
18
18
  :css => nil,
19
19
  :encoding => nil,
20
- :fixup => true,
21
20
  :helper => 'wget',
22
21
  :metadata => {},
23
22
  :output_path => Dir.getwd,
@@ -119,11 +118,6 @@ module Repub
119
118
  options[:metadata][name.to_sym] = value
120
119
  end
121
120
 
122
- opts.on("-F", "--no-fixup",
123
- "Do not attempt to make document meet XHTML 1.0 Strict.",
124
- "Default is to try and fix things that are broken. "
125
- ) { |value| options[:fixup] = false }
126
-
127
121
  opts.on("-e", "--encoding NAME", String,
128
122
  "Set source document encoding. Default is to autodetect."
129
123
  ) { |value| options[:encoding] = value }
@@ -1,5 +1,6 @@
1
1
  require 'rubygems'
2
2
  require 'nokogiri'
3
+ require 'repub/epub'
3
4
 
4
5
  module Repub
5
6
  class App
@@ -11,7 +12,7 @@ module Repub
11
12
  Parser.new(options).parse(cache)
12
13
  end
13
14
 
14
- # Default selectors
15
+ # Default selectors, some reasonable values
15
16
  #
16
17
  Selectors = {
17
18
  :title => '//h1',
@@ -26,37 +27,36 @@ module Repub
26
27
  attr_reader :cache
27
28
  attr_reader :uid
28
29
  attr_reader :title
29
- attr_reader :title_html
30
30
  attr_reader :toc
31
31
 
32
32
  def initialize(options)
33
33
  @selectors = options[:selectors] || Selectors
34
- @fixup = options[:fixup]
35
34
  end
36
35
 
36
+ # Parse downloaded asset cache
37
+ #
37
38
  def parse(cache)
38
39
  raise ParserException, "No HTML document found" if
39
40
  cache.assets[:documents].empty?
41
+ # TODO: limited to a single document only
40
42
  raise ParserException, "More than one HTML document found, this is not supported (yet)" if
41
43
  cache.assets[:documents].size > 1
42
44
 
43
45
  @cache = cache
44
- @asset = @cache.assets[:documents][0]
45
- log.debug "-- Parsing #{@asset}"
46
- @doc = Nokogiri::HTML.parse(IO.read(File.join(@cache.path, @asset)), nil, 'UTF-8')
46
+ @document = @cache.assets[:documents][0]
47
+ log.debug "-- Parsing #{@document}"
48
+ @doc = Nokogiri::HTML.parse(IO.read(File.join(@cache.path, @document)), nil, 'UTF-8')
47
49
 
48
50
  @uid = @cache.name
49
51
  parse_title
50
- parse_title_html
51
52
  parse_toc
52
-
53
53
  self
54
54
  end
55
55
 
56
56
  private
57
-
58
- UNTITLED = 'Untitled'
59
-
57
+
58
+ # Parse document title
59
+ #
60
60
  def parse_title
61
61
  log.debug "-- Looking for title with #{@selectors[:title]}"
62
62
  el = @doc.at(@selectors[:title])
@@ -69,82 +69,72 @@ module Repub
69
69
  @title = title_text.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
70
70
  log.info "Found title \"#{@title}\""
71
71
  else
72
- @title = UNTITLED
72
+ @title = 'Untitled'
73
73
  log.warn "** Could not find document title, using '#{@title}'"
74
74
  end
75
75
  end
76
-
77
- def parse_title_html
78
- log.debug "-- Looking for html title with #{@selectors[:title]}"
79
- el = @doc.at(@selectors[:title])
80
- @title_html = el ? el.inner_html.gsub(/[\r\n]/, '') : UNTITLED
81
- end
82
-
83
- # Helper container for TOC items
76
+
77
+ # Parsed TOC item container
78
+ # Inherit from NavPoint to avoid conversions later in Builder
84
79
  #
85
- class TocItem < Struct.new(
86
- :title,
87
- :uri,
88
- :fragment_id
89
- )
80
+ class TocItem < Repub::Epub::NCX::NavPoint
90
81
 
91
- def initialize(title, uri_with_fragment_id, subitems, asset)
92
- self.title = title
93
- self.uri, self.fragment_id = uri_with_fragment_id.split(/#/)
94
- self.uri = asset if self.uri.empty?
95
- @subitems = subitems || []
82
+ def initialize(title, uri_with_fragment_id, subitems, document)
83
+ uri, fragment_id = uri_with_fragment_id.split(/#/)
84
+ uri = document if uri.empty?
85
+ super(title, "#{uri}##{fragment_id}", subitems)
96
86
  end
97
-
98
- attr_reader :subitems
99
-
100
- def src
101
- "#{uri}##{fragment_id}"
102
- end
103
- end
104
87
 
88
+ end
89
+
90
+ # Look for TOC and recursively parse it
91
+ #
105
92
  def parse_toc
93
+ @toc = []
94
+ depth = 0
95
+
96
+ l = lambda do |section|
97
+ toc_items = []
98
+ depth += 1
99
+ section.xpath(@selectors[:toc_item]).each do |item|
100
+ # Get item's anchor and href
101
+ a = item.name == 'a' ? item : item.at('a')
102
+ next if !a
103
+ href = a['href']
104
+ next if !href
105
+
106
+ # Is this a leaf item or node? Title parsing depends on that.
107
+ subsection = item.xpath(@selectors[:toc_section]).first
108
+ if subsection
109
+ # Item has subsection, use anchor text for title
110
+ title = a.inner_text
111
+ else
112
+ # Leaf item, it is safe to glue inner_text from all children
113
+ title = item.children.map{|c| c.inner_text }.join(' ')
114
+ end
115
+ title = title.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
116
+ log.debug "-- #{" " * depth}#{title}"
117
+
118
+ # Parse subsection
119
+ subitems = l.call(subsection) if subsection
120
+
121
+ toc_items << TocItem.new(title, href, subitems, @document)
122
+ end
123
+ depth -= 1
124
+ toc_items
125
+ end
126
+
106
127
  log.debug "-- Looking for TOC with #{@selectors[:toc]}"
107
- el = @doc.xpath(@selectors[:toc]).first
108
- if el
109
- @toc = parse_toc_section(el)
128
+ toc_element = @doc.xpath(@selectors[:toc]).first
129
+
130
+ if toc_element
131
+ log.debug "-- Found TOC, parsing items with #{@selectors[:toc_item]} and sections with #{@selectors[:toc_section]}"
132
+ @toc = l.call(toc_element)
110
133
  log.info "Found TOC with #{@toc.size} top-level items"
111
134
  else
112
- @toc = []
113
135
  log.warn "** Could not find document table of contents"
114
136
  end
115
137
  end
116
-
117
- def parse_toc_section(section)
118
- toc = []
119
- log.debug "-- Looking for TOC items with #{@selectors[:toc_item]}"
120
- section.xpath(@selectors[:toc_item]).each do |item|
121
- # Get item's anchor and href
122
- a = item.name == 'a' ? item : item.at('a')
123
- next if !a
124
- href = a['href']
125
- next if !href
126
- # Is this a leaf item or node ?
127
- subsection = item.xpath(@selectors[:toc_section]).first
128
- if subsection
129
- # Item has subsection, use anchor text for title
130
- title = a.inner_text
131
- else
132
- # Leaf item, glue inner_text from all children
133
- title = item.children.map{|c| c.inner_text }.join(' ')
134
- end
135
- title = title.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
136
- log.debug "-- Found item: #{title}"
137
- # Parse sub-section
138
- if subsection
139
- log.debug "-- Found section with #{@selectors[:toc_section]}"
140
- log.debug "-- >"
141
- subitems = parse_toc_section(subsection)
142
- log.debug '-- .'
143
- end
144
- toc << TocItem.new(title, href, subitems, @asset)
145
- end
146
- toc
147
- end
148
138
  end
149
139
 
150
140
  end
@@ -0,0 +1,135 @@
1
+ require 'repub/app/filter'
2
+
3
module Repub
  class App
    # Filters applied to a document after it has been fetched and parsed.
    # FileFilters work on the raw document text, DocumentFilters on a
    # parsed Nokogiri document. Both use the Filter mixin, which supplies
    # the +filter+ DSL and the class-level +options+ read by the blocks.
    #
    class PostFilters

      # Filters taking and returning the document source as a String.
      # None of them modifies the string it is given.
      #
      class FileFilters
        include Filter

        # Do rx substitutions
        #
        # Each entry of options[:rx] has the form <d>pattern<d>replacement<d>
        # where <d>, the first character, is the delimiter (e.g. "/foo/bar/").
        # The replacement is optional and defaults to the empty string; a
        # delimiter inside pattern or replacement is written backslash-escaped.
        # Raises ParserException when an entry has no pattern or too many parts.
        #
        filter :do_rxes do |s|
          rxes = (options || {})[:rx]
          rxes.each do |rx|
            # Work on a copy so the caller's option strings are not changed
            rx = rx.strip
            delimiter = rx[0, 1]
            raise ParserException, "Invalid regular expression" if delimiter.empty?
            # The delimiter is arbitrary user input and may be a regexp
            # metacharacter ("|", ".", "+", ...), so it must be escaped
            escaped = Regexp.escape(delimiter)
            # Hide backslash-escaped delimiters behind a newline placeholder,
            # split on the remaining ones, then put the delimiters back
            marked = rx.gsub(/\\#{escaped}/, "\n")
            parts = marked.split(/#{escaped}/).reject { |e| e.empty? }
            parts = parts.map { |e| e.gsub("\n") { delimiter } }
            raise ParserException, "Invalid regular expression" if parts.empty? || parts.size > 2
            pattern = parts[0]
            replacement = parts[1] || ''
            shown = pattern.gsub(delimiter) { "\\#{delimiter}" }
            log.info "Replacing pattern /#{shown}/ with \"#{replacement}\""
            s = s.gsub(Regexp.new(pattern), replacement)
          end if rxes
          s
        end

        # Remove xml preamble if any
        #
        filter :fix_xml_preamble do |s|
          preamble_rx = /^\s*<\?xml\s+[^>]+>\s*/mi
          if s =~ preamble_rx
            log.debug "-- Removing xml preamble"
            s = s.sub(preamble_rx, '')
          end
          s
        end

        # Replace doctype
        #
        # Any existing doctype declaration is dropped and the XHTML 1.0
        # Transitional doctype is put at the very top of the document.
        #
        filter :fix_doctype do |s|
          doctype_rx = /^\s*<!DOCTYPE\s+[^>]+>\s*/mi
          log.debug "-- Replacing doctype"
          "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + s.sub(doctype_rx, '')
        end
      end

      # Filters taking and returning a parsed document. They modify the
      # document in place and return it.
      #
      class DocumentFilters
        include Filter

        # Set Content-Type charset to UTF-8
        #
        filter :fix_content_type do |doc|
          doc.xpath('//head/meta[@http-equiv="Content-Type"]').each do |el|
            el['content'] = 'text/html; charset=utf-8'
          end
          doc
        end

        # Process styles
        #
        # Does nothing unless options[:css] is set. When it is "-", all
        # stylesheet links and style elements in head are removed; otherwise
        # the stylesheet links are replaced with one link to the basename of
        # the given file.
        #
        filter :fix_styles do |doc|
          css = (options || {})[:css]
          if css && !css.empty?
            # Remove all stylesheet links
            doc.xpath('//head/link[@rel="stylesheet"]').remove
            if css == '-'
              # Also remove all inline styles
              doc.xpath('//head/style').remove
              log.info "Removing all stylesheet links and style elements"
            else
              head = doc.at('//head')
              if head
                # Add custom stylesheet link
                link = Nokogiri::XML::Node.new('link', doc)
                link['rel'] = 'stylesheet'
                link['type'] = 'text/css'
                link['href'] = File.basename(css)
                # Add as the last child so it has precedence over (possible) inline styles before
                head.add_child(link)
                log.info "Replacing CSS refs with \"#{link['href']}\""
              else
                # Without a head element there is nowhere to attach the link
                log.warn "** Could not find head element, custom stylesheet link not added"
              end
            end
          end
          doc
        end

        # Insert elements after/before selector
        #
        # Each entry of options[:after] and options[:before] is a hash whose
        # first key is an XPath selector and whose value is the fragment to
        # insert; the fragment's children are placed next to the first
        # element matching the selector, keeping their order. Entries whose
        # selector matches nothing are skipped.
        #
        filter :do_inserts do |doc|
          opts = options || {}
          opts[:after].each do |e|
            selector = e.keys.first
            fragment = e[selector]
            element = doc.xpath(selector).first
            if element
              log.info "Inserting fragment \"#{fragment.to_html}\" after \"#{selector}\""
              # Each node goes directly after the element, so insert in
              # reverse to end up with the original order
              fragment.children.to_a.reverse.each { |node| element.add_next_sibling(node) }
            end
          end if opts[:after]
          opts[:before].each do |e|
            selector = e.keys.first
            fragment = e[selector]
            element = doc.xpath(selector).first
            if element
              log.info "Inserting fragment \"#{fragment}\" before \"#{selector}\""
              fragment.children.to_a.each { |node| element.add_previous_sibling(node) }
            end
          end if opts[:before]
          doc
        end

        # Remove elements
        #
        # Removes every element matching each selector in options[:remove].
        #
        filter :do_removes do |doc|
          selectors = (options || {})[:remove]
          selectors.each do |selector|
            log.info "Removing elements \"#{selector}\""
            doc.search(selector).remove
          end if selectors
          doc
        end

        # TODO: XHTML requires <a> elements directly under body to be
        # wrapped in an enclosing element
        # filter :wrap_anchors do |doc|
        #   log.info "Wrapping anchors"
        #   doc.xpath('//body/a').each do |a|
        #     wrapper = Nokogiri::XML::Node.new('p', doc)
        #     a.add_next_sibling(wrapper)
        #     wrapper << a
        #   end
        #   doc
        # end
      end

    end
  end
end