repub 0.3.3 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,7 +4,7 @@ require 'uri'
4
4
  require 'iconv'
5
5
  require 'rubygems'
6
6
 
7
- # Temporary disable warnings from chardet
7
+ # Disable warnings from chardet
8
8
  old_verbose = $VERBOSE
9
9
  $VERBOSE = false
10
10
  require 'UniversalDetector'
@@ -17,7 +17,7 @@ module Repub
17
17
  class FetcherException < RuntimeError; end
18
18
 
19
19
  def fetch
20
- Fetcher.new(options).fetch
20
+ FetcherSupport.new(options).fetch
21
21
  end
22
22
 
23
23
  AssetTypes = {
@@ -26,7 +26,7 @@ module Repub
26
26
  :images => %w[jpg jpeg png gif svg]
27
27
  }
28
28
 
29
- class Fetcher
29
+ class FetcherSupport
30
30
  include Logger
31
31
 
32
32
  Downloaders = {
@@ -63,34 +63,21 @@ module Repub
63
63
  raise FetcherException, "Fetch failed."
64
64
  end
65
65
  unless cache.cached?
66
- fix_filenames(cache)
67
- fix_encoding(cache, @options[:encoding])
66
+ preprocess cache
68
67
  end
69
68
  end
70
69
  end
71
70
 
72
71
  private
73
72
 
74
- def fix_filenames(cache)
75
- # TODO: fix non-alphanum characters in doc filenames
76
- end
77
-
78
- def fix_encoding(cache, encoding = nil)
79
- cache.assets[:documents].each do |doc|
80
- unless encoding
81
- log.info "Detecting encoding for #{doc}"
82
- s = IO.read(doc)
83
- raise FetcherException, "empty document" unless s
84
- encoding = UniversalDetector.chardet(s)['encoding']
85
- end
86
- if encoding.downcase != 'utf-8'
87
- log.info "Source encoding appears to be #{encoding}, converting to UTF-8"
88
- s = Iconv.conv('utf-8', encoding, IO.read(doc))
89
- File.open(doc, 'w') { |f| f.write(s) }
90
- end
73
+ def preprocess(cache)
74
+ cache.assets[:documents].each do |file|
75
+ log.info "Preprocessing #{file}"
76
+ s = PreFilters.apply_filters(IO.read(file), @options)
77
+ File.open(file, 'w') { |f| f.write(s) }
91
78
  end
92
79
  end
93
-
80
+
94
81
  def which(cmd)
95
82
  if !RUBY_PLATFORM.match('mswin')
96
83
  cmd = `/usr/bin/which #{cmd}`.strip
@@ -0,0 +1,30 @@
1
+ module Repub
2
+ class App
3
+ module Filter
4
+
5
+ def self.included(base)
6
+ (class << base; self; end).instance_eval do
7
+ define_method(:filter) do |name, &block|
8
+ @filters ||= []
9
+ @filters << {:name => name, :proc => Proc.new(&block) }
10
+ end
11
+ attr_reader :filters
12
+ attr_reader :options
13
+ end
14
+ base.extend(ClassMethods)
15
+ base.extend(Logger)
16
+ end
17
+
18
+ def options
19
+ self.class.options
20
+ end
21
+
22
+ module ClassMethods
23
+ def apply_filters(input, options = nil)
24
+ @options = options
25
+ @filters.inject(input) { |input, filter| filter[:proc].call(input) }
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end
@@ -17,7 +17,6 @@ module Repub
17
17
  :browser => false,
18
18
  :css => nil,
19
19
  :encoding => nil,
20
- :fixup => true,
21
20
  :helper => 'wget',
22
21
  :metadata => {},
23
22
  :output_path => Dir.getwd,
@@ -119,11 +118,6 @@ module Repub
119
118
  options[:metadata][name.to_sym] = value
120
119
  end
121
120
 
122
- opts.on("-F", "--no-fixup",
123
- "Do not attempt to make document meet XHTML 1.0 Strict.",
124
- "Default is to try and fix things that are broken. "
125
- ) { |value| options[:fixup] = false }
126
-
127
121
  opts.on("-e", "--encoding NAME", String,
128
122
  "Set source document encoding. Default is to autodetect."
129
123
  ) { |value| options[:encoding] = value }
@@ -1,5 +1,6 @@
1
1
  require 'rubygems'
2
2
  require 'nokogiri'
3
+ require 'repub/epub'
3
4
 
4
5
  module Repub
5
6
  class App
@@ -11,7 +12,7 @@ module Repub
11
12
  Parser.new(options).parse(cache)
12
13
  end
13
14
 
14
- # Default selectors
15
+ # Default selectors, some reasonable values
15
16
  #
16
17
  Selectors = {
17
18
  :title => '//h1',
@@ -26,37 +27,36 @@ module Repub
26
27
  attr_reader :cache
27
28
  attr_reader :uid
28
29
  attr_reader :title
29
- attr_reader :title_html
30
30
  attr_reader :toc
31
31
 
32
32
  def initialize(options)
33
33
  @selectors = options[:selectors] || Selectors
34
- @fixup = options[:fixup]
35
34
  end
36
35
 
36
+ # Parse downloaded asset cache
37
+ #
37
38
  def parse(cache)
38
39
  raise ParserException, "No HTML document found" if
39
40
  cache.assets[:documents].empty?
41
+ # TODO: limited to a single document only
40
42
  raise ParserException, "More than one HTML document found, this is not supported (yet)" if
41
43
  cache.assets[:documents].size > 1
42
44
 
43
45
  @cache = cache
44
- @asset = @cache.assets[:documents][0]
45
- log.debug "-- Parsing #{@asset}"
46
- @doc = Nokogiri::HTML.parse(IO.read(File.join(@cache.path, @asset)), nil, 'UTF-8')
46
+ @document = @cache.assets[:documents][0]
47
+ log.debug "-- Parsing #{@document}"
48
+ @doc = Nokogiri::HTML.parse(IO.read(File.join(@cache.path, @document)), nil, 'UTF-8')
47
49
 
48
50
  @uid = @cache.name
49
51
  parse_title
50
- parse_title_html
51
52
  parse_toc
52
-
53
53
  self
54
54
  end
55
55
 
56
56
  private
57
-
58
- UNTITLED = 'Untitled'
59
-
57
+
58
+ # Parse document title
59
+ #
60
60
  def parse_title
61
61
  log.debug "-- Looking for title with #{@selectors[:title]}"
62
62
  el = @doc.at(@selectors[:title])
@@ -69,82 +69,72 @@ module Repub
69
69
  @title = title_text.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
70
70
  log.info "Found title \"#{@title}\""
71
71
  else
72
- @title = UNTITLED
72
+ @title = 'Untitled'
73
73
  log.warn "** Could not find document title, using '#{@title}'"
74
74
  end
75
75
  end
76
-
77
- def parse_title_html
78
- log.debug "-- Looking for html title with #{@selectors[:title]}"
79
- el = @doc.at(@selectors[:title])
80
- @title_html = el ? el.inner_html.gsub(/[\r\n]/, '') : UNTITLED
81
- end
82
-
83
- # Helper container for TOC items
76
+
77
+ # Parsed TOC item container
78
+ # Inherit from NavPoint to avoid conversions later in Builder
84
79
  #
85
- class TocItem < Struct.new(
86
- :title,
87
- :uri,
88
- :fragment_id
89
- )
80
+ class TocItem < Repub::Epub::NCX::NavPoint
90
81
 
91
- def initialize(title, uri_with_fragment_id, subitems, asset)
92
- self.title = title
93
- self.uri, self.fragment_id = uri_with_fragment_id.split(/#/)
94
- self.uri = asset if self.uri.empty?
95
- @subitems = subitems || []
82
+ def initialize(title, uri_with_fragment_id, subitems, document)
83
+ uri, fragment_id = uri_with_fragment_id.split(/#/)
84
+ uri = document if uri.empty?
85
+ super(title, "#{uri}##{fragment_id}", subitems)
96
86
  end
97
-
98
- attr_reader :subitems
99
-
100
- def src
101
- "#{uri}##{fragment_id}"
102
- end
103
- end
104
87
 
88
+ end
89
+
90
+ # Look for TOC and recursively parse it
91
+ #
105
92
  def parse_toc
93
+ @toc = []
94
+ depth = 0
95
+
96
+ l = lambda do |section|
97
+ toc_items = []
98
+ depth += 1
99
+ section.xpath(@selectors[:toc_item]).each do |item|
100
+ # Get item's anchor and href
101
+ a = item.name == 'a' ? item : item.at('a')
102
+ next if !a
103
+ href = a['href']
104
+ next if !href
105
+
106
+ # Is this a leaf item or node? Title parsing depends on that.
107
+ subsection = item.xpath(@selectors[:toc_section]).first
108
+ if subsection
109
+ # Item has subsection, use anchor text for title
110
+ title = a.inner_text
111
+ else
112
+ # Leaf item, it is safe to glue inner_text from all children
113
+ title = item.children.map{|c| c.inner_text }.join(' ')
114
+ end
115
+ title = title.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
116
+ log.debug "-- #{" " * depth}#{title}"
117
+
118
+ # Parse subsection
119
+ subitems = l.call(subsection) if subsection
120
+
121
+ toc_items << TocItem.new(title, href, subitems, @document)
122
+ end
123
+ depth -= 1
124
+ toc_items
125
+ end
126
+
106
127
  log.debug "-- Looking for TOC with #{@selectors[:toc]}"
107
- el = @doc.xpath(@selectors[:toc]).first
108
- if el
109
- @toc = parse_toc_section(el)
128
+ toc_element = @doc.xpath(@selectors[:toc]).first
129
+
130
+ if toc_element
131
+ log.debug "-- Found TOC, parsing items with #{@selectors[:toc_item]} and sections with #{@selectors[:toc_section]}"
132
+ @toc = l.call(toc_element)
110
133
  log.info "Found TOC with #{@toc.size} top-level items"
111
134
  else
112
- @toc = []
113
135
  log.warn "** Could not find document table of contents"
114
136
  end
115
137
  end
116
-
117
- def parse_toc_section(section)
118
- toc = []
119
- log.debug "-- Looking for TOC items with #{@selectors[:toc_item]}"
120
- section.xpath(@selectors[:toc_item]).each do |item|
121
- # Get item's anchor and href
122
- a = item.name == 'a' ? item : item.at('a')
123
- next if !a
124
- href = a['href']
125
- next if !href
126
- # Is this a leaf item or node ?
127
- subsection = item.xpath(@selectors[:toc_section]).first
128
- if subsection
129
- # Item has subsection, use anchor text for title
130
- title = a.inner_text
131
- else
132
- # Leaf item, glue inner_text from all children
133
- title = item.children.map{|c| c.inner_text }.join(' ')
134
- end
135
- title = title.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
136
- log.debug "-- Found item: #{title}"
137
- # Parse sub-section
138
- if subsection
139
- log.debug "-- Found section with #{@selectors[:toc_section]}"
140
- log.debug "-- >"
141
- subitems = parse_toc_section(subsection)
142
- log.debug '-- .'
143
- end
144
- toc << TocItem.new(title, href, subitems, @asset)
145
- end
146
- toc
147
- end
148
138
  end
149
139
 
150
140
  end
@@ -0,0 +1,135 @@
1
+ require 'repub/app/filter'
2
+
3
+ module Repub
4
+ class App
5
+ class PostFilters
6
+
7
+ class FileFilters
8
+ include Filter
9
+
10
+ # Apply each regular expression substitution listed in options[:rx]
11
+ #
12
+ filter :do_rxes do |s|
13
+ options[:rx].each do |rx|
14
+ rx.strip!
15
+ delimiter = rx[0, 1]
16
+ rx = rx.gsub(/\\#{delimiter}/, "\n")
17
+ ra = rx.split(/#{delimiter}/).reject {|e| e.empty? }.each {|e| e.gsub!(/\n/, "#{delimiter}")}
18
+ raise ParserException, "Invalid regular expression" if ra.empty? || ra[0].nil? || ra.size > 2
19
+ pattern = ra[0]
20
+ replacement = ra[1] || ''
21
+ log.info "Replacing pattern /#{pattern.gsub(/#{delimiter}/, "\\#{delimiter}")}/ with \"#{replacement}\""
22
+ s.gsub!(Regexp.new(pattern), replacement)
23
+ end if options[:rx]
24
+ s
25
+ end
26
+
27
+ # Remove xml preamble if any
28
+ #
29
+ filter :fix_xml_preamble do |s|
30
+ preamble_rx = /^\s*<\?xml\s+[^>]+>\s*/mi
31
+ if s =~ preamble_rx
32
+ log.debug "-- Removing xml preamble"
33
+ s.sub!(preamble_rx, '')
34
+ end
35
+ s
36
+ end
37
+
38
+ # Replace doctype
39
+ #
40
+ filter :fix_doctype do |s|
41
+ doctype_rx = /^\s*<!DOCTYPE\s+[^>]+>\s*/mi
42
+ if s =~ doctype_rx
43
+ s.sub!(doctype_rx, '')
44
+ end
45
+ log.debug "-- Replacing doctype"
46
+ s = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + s
47
+ s
48
+ end
49
+ end
50
+
51
+ class DocumentFilters
52
+ include Filter
53
+
54
+ # Set Content-Type charset to UTF-8
55
+ #
56
+ filter :fix_content_type do |doc|
57
+ doc.xpath('//head/meta[@http-equiv="Content-Type"]').each do |el|
58
+ el['content'] = 'text/html; charset=utf-8'
59
+ end
60
+ doc
61
+ end
62
+
63
+ # Process styles
64
+ #
65
+ filter :fix_styles do |doc|
66
+ if options[:css] && !options[:css].empty?
67
+ # Remove all stylesheet links
68
+ doc.xpath('//head/link[@rel="stylesheet"]').remove
69
+ if options[:css] == '-'
70
+ # Also remove all <style> elements from <head>
71
+ doc.xpath('//head/style').remove
72
+ log.info "Removing all stylesheet links and style elements"
73
+ else
74
+ # Add custom stylesheet link
75
+ link = Nokogiri::XML::Node.new('link', doc)
76
+ link['rel'] = 'stylesheet'
77
+ link['type'] = 'text/css'
78
+ link['href'] = File.basename(@options[:css])
79
+ # Add as the last child of <head> so it takes precedence over any <style> elements that come before it
80
+ doc.at('//head').add_child(link)
81
+ log.info "Replacing CSS refs with \"#{link['href']}\""
82
+ end
83
+ end
84
+ doc
85
+ end
86
+
87
+ # Insert elements after/before selector
88
+ #
89
+ filter :do_inserts do |doc|
90
+ options[:after].each do |e|
91
+ selector = e.keys.first
92
+ fragment = e[selector]
93
+ element = doc.xpath(selector).first
94
+ if element
95
+ log.info "Inserting fragment \"#{fragment.to_html}\" after \"#{selector}\""
96
+ fragment.children.to_a.reverse.each {|node| element.add_next_sibling(node) }
97
+ end
98
+ end if options[:after]
99
+ options[:before].each do |e|
100
+ selector = e.keys.first
101
+ fragment = e[selector]
102
+ element = doc.xpath(selector).first
103
+ if element
104
+ log.info "Inserting fragment \"#{fragment}\" before \"#{selector}\""
105
+ fragment.children.to_a.each {|node| element.add_previous_sibling(node) }
106
+ end
107
+ end if options[:before]
108
+ doc
109
+ end
110
+
111
+ # Remove elements
112
+ #
113
+ filter :do_removes do |doc|
114
+ options[:remove].each do |selector|
115
+ log.info "Removing elements \"#{selector}\""
116
+ doc.search(selector).remove
117
+ end if options[:remove]
118
+ doc
119
+ end
120
+
121
+ # TODO: XHTML requires <a> elements directly under <body> to be wrapped in an enclosing element (e.g. <p>)
122
+ # filter :wrap_anchors do |doc|
123
+ # log.info "Wrapping anchors"
124
+ # doc.xpath('//body/a').each do |a|
125
+ # wrapper = Nokogiri::XML::Node.new('p', doc)
126
+ # a.add_next_sibling(wrapper)
127
+ # wrapper << a
128
+ # end
129
+ # doc
130
+ # end
131
+ end
132
+
133
+ end
134
+ end
135
+ end