repub 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,164 @@
1
require 'digest/sha1'
require 'fileutils'
require 'iconv'
require 'shellwords'
require 'uri'
require 'rubygems'
6
+
7
+ old_verbose = $VERBOSE
8
+ $VERBOSE = false
9
+ require 'UniversalDetector'
10
+ $VERBOSE = old_verbose
11
+
12
+ module Repub
13
+ class App
14
+ module Fetcher
15
+
16
# Error raised by the fetcher and its cache when a download cannot be
# completed (bad URL, missing helper program, failed fetch, ...).
class FetcherException < RuntimeError
end
17
+
18
# Mixin entry point: builds a Fetcher from the including object's
# +options+ and runs it, returning whatever Fetcher#fetch returns.
def fetch
  fetcher = Fetcher.new(options)
  fetcher.fetch
end
21
+
22
# File extensions recognised for each kind of downloaded asset.
# Keys are asset categories, values are lists of extensions (without dot).
# Frozen so the shared table cannot be modified by accident at runtime.
AssetTypes = {
  :documents   => %w[html htm].freeze,
  :stylesheets => %w[css].freeze,
  :images      => %w[jpg jpeg png gif svg].freeze
}.freeze
27
+
28
# Downloads the document at options[:url] into the cache by running an
# external helper program (wget or httrack).
#
# The helper executable and its options can be overridden with the
# REPUB_DOWNLOADER and REPUB_DOWNLOADER_OPTIONS environment variables.
class Fetcher
  include Logger

  # Known helpers: executable name and default command-line options.
  Downloaders = {
    :wget    => { :cmd => 'wget',    :options => '-nv -E -H -k -p -nH -nd' },
    :httrack => { :cmd => 'httrack', :options => '-gB -r2 +*.css +*.jpg -*.xml -*.html' }
  }

  # options - Hash; reads :helper (name of a Downloaders entry, defaults
  #           to wget when missing or blank) and later :url.
  #
  # Raises FetcherException when the helper name is unknown or when the
  # helper executable cannot be located.
  def initialize(options)
    @options = options
    @downloader_path = ENV['REPUB_DOWNLOADER']
    @downloader_options = ENV['REPUB_DOWNLOADER_OPTIONS']

    helper = @options[:helper]
    key = (helper.nil? || helper.to_s.empty?) ? :wget : helper.to_s.to_sym
    downloader = Downloaders[key]
    # An unknown name used to surface as NoMethodError on nil.
    raise FetcherException, "unknown helper '#{helper}'" unless downloader

    log.debug "-- Using #{downloader[:cmd]} #{downloader[:options]}"
    @downloader_path ||= which(downloader[:cmd])
    @downloader_options ||= downloader[:options]
  end

  # Runs the helper inside the cache directory for the URL.
  #
  # Returns the value of Cache.for_url (the populated cache object).
  # Raises FetcherException for an empty or unparsable URL, or when the
  # helper fails or leaves the cache directory empty.
  def fetch
    url = @options[:url]
    raise FetcherException, "empty URL" if !url || url.empty?
    begin
      URI.parse(url)
    rescue URI::InvalidURIError
      raise FetcherException, "invalid URL: #{url}"
    end

    # Pass an argument vector instead of one interpolated string so the
    # URL never goes through a shell: '&', ';', '?' etc. in URLs would
    # otherwise be interpreted as shell syntax.
    argv = [@downloader_path] + Shellwords.shellwords(@downloader_options.to_s) + [url]

    Cache.for_url(url) do |cache|
      log.debug "-- Downloading into #{cache.path}"
      unless system(*argv) && !cache.empty?
        raise FetcherException, "Fetch failed."
      end
    end
  end

  private

  # Resolves +cmd+ to a full path using /usr/bin/which. On mswin
  # platforms the name is returned unchanged.
  #
  # Raises FetcherException when the program is not found.
  def which(cmd)
    return cmd if RUBY_PLATFORM.match('mswin')
    path = `/usr/bin/which #{cmd}`.strip
    # Keep the requested name for the message; the lookup result is empty here.
    raise FetcherException, "#{cmd}: helper not found." if path.empty?
    path
  end
end
76
+
77
# On-disk cache of downloaded assets. Each URL gets its own directory
# under Cache.root, named after the SHA1 hex digest of the URL.
class Cache
  include Logger

  # Directory that holds all per-URL cache directories.
  def self.root
    File.join(App.data_path, 'cache')
  end

  def self.inventorize
    # TODO
  end

  # Deletes everything under the cache root. Best effort: failures
  # (for example a cache root that does not exist yet) are ignored.
  def self.cleanup
    Dir.chdir(self.root) { FileUtils.rm_r(Dir.glob('*')) }
  rescue
    # ignore exceptions
  end

  attr_reader :url
  attr_reader :name
  attr_reader :path
  # Hash of asset type => list of file names, filled in by #for_url.
  attr_reader :assets

  # Convenience wrapper: Cache.new(url).for_url(&block).
  def self.for_url(url, &block)
    self.new(url).for_url(&block)
  end

  # Makes sure the cache directory for the URL is populated and indexed.
  #
  # When the directory does not exist yet it is created and the block is
  # called with this cache while the working directory is the cache
  # directory; the block is expected to download the files. Freshly
  # downloaded documents are then converted to UTF-8. When the directory
  # already exists the block is not called.
  #
  # If anything goes wrong while populating a fresh cache (including an
  # interrupt), the directory is removed again so that a later run does
  # not mistake a partial download for a valid cache entry.
  #
  # Returns self.
  def for_url(&block)
    if File.exist?(@path)
      log.info "Using cached assets"
      log.debug "-- Cache is #{@path}"
      Dir.chdir(@path) { enumerate_assets }
    else
      populate(&block)
    end
    self
  end

  # True when the cache directory contains no (non-hidden) entries.
  def empty?
    Dir.glob(File.join(@path, '*')).empty?
  end

  private

  def initialize(url)
    @url = url
    @name = Digest::SHA1.hexdigest(@url)
    @path = File.join(Cache.root, @name)
  end

  # Creates the cache directory, lets the caller's block download into
  # it, then indexes and converts the result. Rolls back on any failure.
  def populate
    FileUtils.mkdir_p(@path)
    complete = false
    begin
      Dir.chdir(@path) { yield self }
      Dir.chdir(@path) do
        enumerate_assets
        @assets[:documents].each { |doc| convert_to_utf8(doc) }
      end
      complete = true
    ensure
      # rm_rf so that a cleanup problem cannot mask the original error
      FileUtils.rm_rf(@path) unless complete
    end
  end

  # Builds @assets from the files in the current working directory,
  # which must be the cache directory.
  def enumerate_assets
    @assets = {}
    AssetTypes.each_pair do |asset_type, file_types|
      @assets[asset_type] = file_types.map { |file_type| Dir.glob("*.#{file_type}") }.flatten
    end
  end

  # Detects the encoding of +doc+ and rewrites the file as UTF-8 when it
  # is something else. The file is left untouched when no encoding could
  # be detected.
  def convert_to_utf8(doc)
    log.info "Detecting encoding for #{doc}"
    s = IO.read(doc)
    raise FetcherException, "empty document" unless s
    detected = UniversalDetector.chardet(s)
    encoding = detected && detected['encoding']
    if encoding.nil? || encoding.empty?
      # NOTE(review): assumes the detector reports failure as a nil/empty
      # 'encoding' -- confirm against the UniversalDetector gem.
      log.warn "** Could not detect encoding for #{doc}, leaving it as is"
    elsif encoding.downcase != 'utf-8'
      log.info "Looks like #{encoding}, converting to UTF-8"
      converted = Iconv.conv('utf-8', encoding, s)
      File.open(doc, 'w') { |f| f.write(converted) }
    else
      log.info "Looks like UTF-8, no conversion needed"
    end
  end
end
161
+
162
+ end
163
+ end
164
+ end
@@ -0,0 +1,52 @@
1
+ require 'singleton'
2
+
3
+ module Repub
4
+ class App
5
# Mixin that gives the including class a +log+ method returning the
# process-wide logger.
module Logger

  # Verbosity levels, in increasing order of chattiness.
  LOGGER_QUIET   = 0  # errors only
  LOGGER_NORMAL  = 1  # informational messages and errors
  LOGGER_VERBOSE = 2  # all of the above plus debugging output

  # Returns the shared logger instance.
  def log
    Logger.instance
  end

  # Singleton that writes messages to configurable output streams,
  # filtered by the current verbosity level.
  class Logger
    include Singleton

    attr_accessor :level, :stdout, :stderr

    # Written to stdout at LOGGER_VERBOSE only.
    def debug(msg)
      emit(@stdout, LOGGER_VERBOSE, msg)
    end

    # Written to stdout at LOGGER_NORMAL and above.
    def info(msg)
      emit(@stdout, LOGGER_NORMAL, msg)
    end

    # Written to stderr at LOGGER_QUIET and above.
    def error(msg)
      emit(@stderr, LOGGER_QUIET, msg)
    end
    alias_method :warn, :error

    # Reports the message as an error, then terminates with status 1.
    def fatal(msg)
      error(msg)
      exit 1
    end

    private

    def initialize
      @level, @stdout, @stderr = LOGGER_NORMAL, STDOUT, STDERR
    end

    # Prints +msg+ on +stream+ when the current level reaches +threshold+.
    def emit(stream, threshold, msg)
      stream.puts(msg) if @level >= threshold
    end
  end

end
51
+ end
52
+ end
@@ -0,0 +1,180 @@
1
+ require 'optparse'
2
+
3
+ module Repub
4
+ class App
5
+ module Options
6
+ include Logger
7
+
8
+ attr_reader :options
9
+
10
# Builds @options from defaults, the stored default profile and the
# command line in +args+ (an Array of Strings, modified in place by
# OptionParser#parse!). The last remaining argument is taken as the URL.
#
# Informational switches (--help, --version, --list-profiles, --cleanup)
# do their work and exit with status 0. Usage errors (no arguments,
# unparsable option, missing URL) print a message and exit with status 1.
def parse_options(args)

  # Default options
  @options = {
    :browser      => false,
    :css          => nil,
    :encoding     => nil,
    :fixup        => true,
    :helper       => 'wget',
    :metadata     => {},
    :output_path  => Dir.getwd,
    :profile      => 'default',
    :remove       => [],
    :rx           => [],
    # Copy the defaults: -x writes into this hash and must not alter
    # the shared Parser::Selectors constant.
    :selectors    => Parser::Selectors.dup,
    :url          => nil,
    :verbosity    => Repub::App::Logger::LOGGER_NORMAL
  }

  # Load default profile; create it on first use
  if load_profile(options[:profile]).empty?
    write_profile(options[:profile])
  end

  # Parse command line
  parser = OptionParser.new do |opts|
    opts.banner = <<-BANNER.gsub(/^ {6}/, '')

      Repub is a simple HTML to ePub converter.

      Usage: #{App.name} [options] url

      General options:
    BANNER

    opts.on("-D", "--downloader NAME ", ['wget', 'httrack'],
      "Which downloader to use to get files (wget or httrack).",
      "Default is #{options[:helper]}."
    ) { |value| options[:helper] = value }

    opts.on("-o", "--output PATH", String,
      "Output path for generated ePub file.",
      "Default is #{options[:output_path]}/<Parsed_Title>.epub"
    ) { |value| options[:output_path] = File.expand_path(value) }

    opts.on("-w", "--write-profile NAME", String,
      "Save given options for later reuse as profile NAME."
    ) { |value| options[:profile] = value; write_profile(value) }

    opts.on("-l", "--load-profile NAME", String,
      "Load options from saved profile NAME."
    ) { |value| options[:profile] = value; load_profile(value) }

    opts.on("-W", "--write-default",
      "Save given options for later reuse as default profile."
    ) { write_profile }

    opts.on("-L", "--list-profiles",
      "List saved profiles."
    ) { list_profiles; exit 0 }

    opts.on("-C", "--cleanup",
      "Clean up download cache."
    ) { Fetcher::Cache.cleanup; exit 0 }

    opts.on("-v", "--verbose",
      "Turn on verbose output."
    ) { options[:verbosity] = Repub::App::Logger::LOGGER_VERBOSE }

    opts.on("-q", "--quiet",
      "Turn off any output except errors."
    ) { options[:verbosity] = Repub::App::Logger::LOGGER_QUIET }

    opts.on("-V", "--version",
      "Show version."
    ) { puts Repub.version; exit 0 }

    opts.on("-h", "--help",
      "Show this help message."
    ) { help opts; exit 0 }

    opts.separator ""
    opts.separator " Parser options:"

    opts.on("-x", "--selector NAME:VALUE", String,
      "Set parser XPath selector NAME to VALUE.",
      "Recognized selectors are: [title toc toc_item toc_section]"
    ) do |value|
      pair = value.match(/([^:]+):(.*)/)
      # log.fatal terminates the program
      log.fatal "ERROR: invalid argument: -x '#{value}'. See '#{App.name} --help'." unless pair
      options[:selectors][pair[1].to_sym] = pair[2]
    end

    opts.on("-m", "--meta NAME:VALUE", String,
      "Set publication information metadata NAME to VALUE.",
      "Valid metadata names are: [creator date description",
      "language publisher relation rights subject title]"
    ) do |value|
      pair = value.match(/([^:]+):(.*)/)
      # log.fatal terminates the program
      log.fatal "ERROR: invalid argument: -m '#{value}'. See '#{App.name} --help'." unless pair
      options[:metadata][pair[1].to_sym] = pair[2]
    end

    opts.on("-F", "--no-fixup",
      "Do not attempt to make document meet XHTML 1.0 Strict.",
      "Default is to try and fix things that are broken. "
    ) { options[:fixup] = false }

    opts.on("-e", "--encoding NAME", String,
      "Set source document encoding. Default is to autodetect."
    ) { |value| options[:encoding] = value }

    opts.separator ""
    opts.separator " Post-processing options:"

    opts.on("-s", "--stylesheet PATH", String,
      "Use custom stylesheet at PATH to add or override existing",
      "CSS references in the source document."
    ) { |value| options[:css] = File.expand_path(value) }

    opts.on("-X", "--remove SELECTOR", String,
      "Remove source element using XPath selector.",
      "Use -X- to ignore stored profile."
    ) do |value|
      # A lone '-' resets the list (drops entries loaded from a profile)
      if value == '-'
        options[:remove] = []
      else
        options[:remove] << value
      end
    end

    opts.on("-R", "--rx /PATTERN/REPLACEMENT/", String,
      "Edit source HTML using regular expressions.",
      "Use -R- to ignore stored profile."
    ) do |value|
      # A lone '-' resets the list (drops entries loaded from a profile)
      if value == '-'
        options[:rx] = []
      else
        options[:rx] << value
      end
    end

    opts.on("-B", "--browse",
      "After processing, open resulting HTML in default browser."
    ) { options[:browser] = true }

  end

  # Called without any arguments: show usage and fail
  if args.empty?
    help parser
    exit 1
  end

  begin
    parser.parse! args
  rescue OptionParser::ParseError => ex
    log.fatal "ERROR: #{ex.to_s}. See '#{App.name} --help'."
  end

  # Whatever is left after option parsing ends with the URL
  options[:url] = args.last
  if options[:url].nil? || options[:url].empty?
    help parser
    log.fatal "ERROR: Please specify an URL."
  end
end
169
+
170
# Prints the usage text in +opts+ followed by a dump of the currently
# selected profile, each section separated by a blank line.
def help(opts)
  puts opts
  puts
  profile_name = options[:profile]
  puts " Current profile (#{profile_name}):"
  dump_profile(options[:profile])
  puts
end
177
+
178
+ end
179
+ end
180
+ end
@@ -0,0 +1,152 @@
1
+ require 'rubygems'
2
+ require 'nokogiri'
3
+
4
+ module Repub
5
+ class App
6
+ module Parser
7
+
8
# Error raised when the downloaded assets cannot be parsed (for example
# when no HTML document, or more than one, is present).
class ParserException < RuntimeError
end
9
+
10
# Mixin entry point: builds a Parser from the including object's
# +options+ and runs it on +cache+, returning the result of Parser#parse.
def parse(cache)
  parser = Parser.new(options)
  parser.parse(cache)
end
13
+
14
# XPath selectors used when none are supplied through the options:
#   :title       - element holding the document title
#   :toc         - element holding the table of contents
#   :toc_item    - TOC entries, relative to a TOC (sub)section
#   :toc_section - nested TOC section, relative to a TOC entry
#
Selectors = {
  :title       => '//h1',
  :toc         => '//ul',
  :toc_item    => './li',
  :toc_section => './ul'
}
22
+
23
# Extracts the title and table of contents from the single HTML
# document held in a fetcher cache.
class Parser
  include Logger

  attr_reader :cache
  attr_reader :uid
  attr_reader :title
  attr_reader :title_html
  attr_reader :toc

  # options - Hash; reads :selectors (falls back to the default
  #           Selectors) and :fixup (stored, not used in this class).
  def initialize(options)
    @selectors = options[:selectors] || Selectors
    @fixup = options[:fixup]
  end

  # Parses the only document in +cache+ and fills in uid, title,
  # title_html and toc.
  #
  # Returns self.
  # Raises ParserException unless the cache holds exactly one document.
  def parse(cache)
    documents = cache.assets[:documents]
    raise ParserException, "No HTML document found" if documents.empty?
    raise ParserException, "More than one HTML document found, this is not supported (yet)" if documents.size > 1

    @cache = cache
    @asset = documents[0]
    log.debug "-- Parsing #{@asset}"
    # Block form so the file handle is closed once parsing is done
    @doc = File.open(File.join(@cache.path, @asset)) do |io|
      Nokogiri::HTML.parse(io, nil, 'UTF-8')
    end

    @uid = @cache.name
    parse_title
    parse_title_html
    parse_toc

    self
  end

  private

  UNTITLED = 'Untitled'

  # Drops line breaks, collapses runs of whitespace and trims the ends.
  def squeeze_text(text)
    text.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
  end

  # Sets @title from the text of the first element matching the :title
  # selector, or to UNTITLED when nothing matches.
  def parse_title
    log.debug "-- Looking for title with #{@selectors[:title]}"
    el = @doc.at(@selectors[:title])
    if el
      if el.children.empty?
        title_text = el.inner_text
      else
        # Join child texts with spaces so adjacent inline elements
        # do not run together
        title_text = el.children.map { |c| c.inner_text }.join(' ')
      end
      @title = squeeze_text(title_text)
      log.info "Found title \"#{@title}\""
    else
      @title = UNTITLED
      log.warn "** Could not find document title, using '#{@title}'"
    end
  end

  # Sets @title_html to the inner HTML of the title element (line
  # breaks removed), or to UNTITLED when nothing matches.
  def parse_title_html
    log.debug "-- Looking for html title with #{@selectors[:title]}"
    el = @doc.at(@selectors[:title])
    @title_html = el ? el.inner_html.gsub(/[\r\n]/, '') : UNTITLED
  end

  # Helper container for TOC items
  #
  class TocItem < Struct.new(
      :title,
      :uri,
      :fragment_id
    )

    # title                - display text of the entry
    # uri_with_fragment_id - href of the entry, "uri#fragment"
    # subitems             - nested TocItems, or nil for none
    # asset                - document name used when the href has no
    #                        uri part (e.g. "#chapter1", "#" or "")
    def initialize(title, uri_with_fragment_id, subitems, asset)
      self.title = title
      self.uri, self.fragment_id = uri_with_fragment_id.split(/#/)
      # split yields no uri at all for "" and "#", hence the nil check
      self.uri = asset if uri.nil? || uri.empty?
      @subitems = subitems || []
    end

    attr_reader :subitems

    def src
      "#{uri}##{fragment_id}"
    end
  end

  # Sets @toc to the items found under the first element matching the
  # :toc selector, or to an empty list when nothing matches.
  def parse_toc
    log.debug "-- Looking for TOC with #{@selectors[:toc]}"
    el = @doc.xpath(@selectors[:toc]).first
    if el
      @toc = parse_toc_section(el)
      log.info "Found TOC with #{@toc.size} top-level items"
    else
      @toc = []
      log.warn "** Could not find document table of contents"
    end
  end

  # Recursively collects TocItems from +section+. Entries without an
  # anchor or without an href are skipped.
  def parse_toc_section(section)
    toc = []
    log.debug "-- Looking for TOC items with #{@selectors[:toc_item]}"
    section.xpath(@selectors[:toc_item]).each do |item|
      # Get item's anchor and href
      a = item.name == 'a' ? item : item.at('a')
      next if !a
      href = a[:href]
      next if !href
      # Is this a leaf item or node ?
      subsection = item.xpath(@selectors[:toc_section]).first
      if subsection
        # Item has subsection, use anchor text for title
        title = a.inner_text
      else
        # Leaf item, glue inner_text from all children
        title = item.children.map { |c| c.inner_text }.join(' ')
      end
      title = squeeze_text(title)
      log.debug "-- Found item: #{title}"
      # Parse sub-section
      subitems = nil
      if subsection
        log.debug "-- Found section with #{@selectors[:toc_section]}"
        log.debug "-- >"
        subitems = parse_toc_section(subsection)
        log.debug '-- .'
      end
      toc << TocItem.new(title, href, subitems, @asset)
    end
    toc
  end
end
149
+
150
+ end
151
+ end
152
+ end