repub 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,164 @@
1
require 'fileutils'
require 'digest/sha1'
require 'uri'
require 'shellwords'
require 'iconv'
require 'rubygems'
6
+
7
+ old_verbose = $VERBOSE
8
+ $VERBOSE = false
9
+ require 'UniversalDetector'
10
+ $VERBOSE = old_verbose
11
+
12
+ module Repub
13
+ class App
14
+ module Fetcher
15
+
16
# Error raised by the fetching code: unknown or missing download helper,
# empty or invalid URL, failed download, empty document.
class FetcherException < RuntimeError
end
17
+
18
# Builds a Fetcher from the current +options+ and returns the result of
# its #fetch.
def fetch
  fetcher = Fetcher.new(options)
  fetcher.fetch
end
21
+
22
# File extensions (without the dot) collected for each kind of cached asset.
AssetTypes = {
  :documents   => ['html', 'htm'],
  :stylesheets => ['css'],
  :images      => ['jpg', 'jpeg', 'png', 'gif', 'svg']
}
27
+
28
+ class Fetcher
29
+ include Logger
30
+
31
# Known download helpers: executable name and the command-line options
# passed to it by default.
Downloaders = {
  :wget => {
    :cmd     => 'wget',
    :options => '-nv -E -H -k -p -nH -nd'
  },
  :httrack => {
    :cmd     => 'httrack',
    :options => '-gB -r2 +*.css +*.jpg -*.xml -*.html'
  }
}
35
+
36
# Prepares a fetcher for the given options hash.
#
# options[:helper] names the download helper (a key of Downloaders); when it
# is nil or empty, wget is used. The environment variables REPUB_DOWNLOADER
# and REPUB_DOWNLOADER_OPTIONS, when set, take precedence over the helper's
# executable path and default options.
#
# Raises FetcherException when the helper name is not a known downloader.
# A FetcherException raised by #which (helper executable not found) is
# passed through with its own message.
def initialize(options)
  @options = options
  @downloader_path = ENV['REPUB_DOWNLOADER']
  @downloader_options = ENV['REPUB_DOWNLOADER_OPTIONS']

  helper = @options[:helper].to_s
  # A hash miss returns nil rather than raising, so an unknown name has to
  # be detected explicitly; otherwise it surfaces as NoMethodError on nil.
  downloader = helper.empty? ? Downloaders[:wget] : Downloaders[helper.to_sym]
  raise FetcherException, "unknown helper '#{@options[:helper]}'" unless downloader

  log.debug "-- Using #{downloader[:cmd]} #{downloader[:options]}"
  @downloader_path ||= which(downloader[:cmd])
  @downloader_options ||= downloader[:options]
end
48
+
49
# Downloads the document at options[:url] into its cache directory by
# running the configured download helper, and returns what Cache.for_url
# returns.
#
# Raises FetcherException when the URL is missing, empty or rejected by
# URI.parse, when the helper command fails, or when it leaves the cache
# empty.
def fetch
  url = @options[:url]
  raise FetcherException, "empty URL" if !url || url.empty?
  begin
    URI.parse(url)
  rescue
    raise FetcherException, "invalid URL: #{url}"
  end
  # The command line is interpreted by a shell, so characters that are
  # ordinary in URLs (&, ;, ?) must be escaped or they would split or
  # background the command. Bourne-style escaping does not apply to the
  # Windows shell, so the URL is left untouched there.
  shell_url = RUBY_PLATFORM.match('mswin') ? url : Shellwords.escape(url)
  cmd = "#{@downloader_path} #{@downloader_options} #{shell_url}"
  Cache.for_url(url) do |cache|
    log.debug "-- Downloading into #{cache.path}"
    unless system(cmd) && !cache.empty?
      raise FetcherException, "Fetch failed."
    end
  end
end
65
+
66
+ private
67
+
68
# Resolves a helper name to the path reported by /usr/bin/which.
# On mswin platforms no lookup is done and +cmd+ is returned unchanged.
#
# Raises FetcherException when the lookup produces no path.
def which(cmd)
  return cmd if RUBY_PLATFORM.match('mswin')

  # Keep the lookup result in its own variable so the error message can
  # still name the helper that was requested.
  path = `/usr/bin/which #{cmd}`.strip
  raise FetcherException, "#{cmd}: helper not found." if path.empty?
  path
end
75
+ end
76
+
77
+ class Cache
78
+ include Logger
79
+
80
# Directory that holds all cache entries: "cache" under App.data_path.
def self.root
  File.join(App.data_path, 'cache')
end
83
+
84
# Placeholder: not implemented yet, always returns nil.
def self.inventorize
  # TODO
  nil
end
87
+
88
# Removes every entry matched by "*" inside the cache root. Best effort:
# any error (for example a missing cache root) is deliberately ignored.
def self.cleanup
  begin
    Dir.chdir(root) do
      entries = Dir['*']
      FileUtils.rm_r(entries)
    end
  rescue
    # ignore exceptions
  end
end
93
+
94
+ attr_reader :url
95
+ attr_reader :name
96
+ attr_reader :path
97
+ attr_reader :assets
98
+
99
# Creates the cache entry for +url+ and runs its #for_url with the given
# block; returns whatever that call returns.
def self.for_url(url, &block)
  cache = new(url)
  cache.for_url(&block)
end
102
+
103
# Makes sure the cache directory for this URL is populated and indexed.
#
# When the directory does not exist yet it is created, the given block is
# called (with this cache, from inside the directory) to download into it,
# the assets are enumerated and every document is converted to UTF-8.
# If any of those steps raises, the directory is removed again so that a
# half-processed download is never mistaken for a valid cache entry, and
# the error is re-raised.
#
# When the directory already exists, the block is not called and the
# assets are only enumerated.
#
# Returns self.
def for_url(&block)
  if File.exist?(@path)
    log.info "Using cached assets"
    log.debug "-- Cache is #{@path}"
    Dir.chdir(@path) { enumerate_assets }
  else
    FileUtils.mkdir_p(@path)
    begin
      Dir.chdir(@path) do
        yield self
        enumerate_assets
        @assets[:documents].each { |doc| convert_to_utf8(doc) }
      end
    rescue
      FileUtils.rm_r(@path)
      raise
    end
  end
  self
end

# Fills @assets with the files in the current directory, grouped by the
# extensions listed in AssetTypes.
def enumerate_assets
  @assets = {}
  AssetTypes.each_pair do |asset_type, file_types|
    @assets[asset_type] = file_types.map { |ext| Dir.glob("*.#{ext}") }.flatten
  end
end

# Detects the encoding of +doc+ and rewrites the file as UTF-8 when needed.
# Raises FetcherException for an empty document.
def convert_to_utf8(doc)
  log.info "Detecting encoding for #{doc}"
  s = IO.read(doc)
  # IO.read gives "" (not nil) for an empty file, so test for both
  raise FetcherException, "empty document" if s.nil? || s.empty?
  encoding = UniversalDetector.chardet(s)['encoding']
  unless encoding
    # NOTE(review): assumes the detector may report no encoding at all;
    # the document is then left as it is instead of failing on nil.
    log.warn "** Could not detect encoding for #{doc}, leaving it unconverted"
    return
  end
  if encoding.downcase != 'utf-8'
    log.info "Looks like #{encoding}, converting to UTF-8"
    converted = Iconv.conv('utf-8', encoding, s)
    File.open(doc, 'w') { |f| f.write(converted) }
  else
    log.info "Looks like UTF-8, no conversion needed"
  end
end

private :enumerate_assets, :convert_to_utf8
148
+
149
# True when nothing in the cache directory matches "*".
def empty?
  entries = Dir[File.join(@path, '*')]
  entries.length.zero?
end
152
+
153
+ private
154
+
155
# Derives the cache entry for +url+: its name is the SHA1 hex digest of the
# URL and its path is that name under Cache.root.
def initialize(url)
  @url = url
  digest = Digest::SHA1.hexdigest(url)
  @name = digest
  @path = File.join(Cache.root, digest)
end
160
+ end
161
+
162
+ end
163
+ end
164
+ end
@@ -0,0 +1,52 @@
1
+ require 'singleton'
2
+
3
module Repub
  class App
    # Mixin giving access to the shared application log through #log.
    module Logger

      # Logging verbosity
      #
      LOGGER_QUIET    = 0   # nothing except errors
      LOGGER_NORMAL   = 1   # info and above
      LOGGER_VERBOSE  = 2   # everything, including debugging noise

      # Returns the single shared Logger instance.
      def log
        Logger.instance
      end

      # Level-filtered writer; messages go to the configured stdout/stderr
      # streams (STDOUT and STDERR unless reassigned).
      class Logger
        include Singleton

        attr_accessor :level, :stdout, :stderr

        # Written to stdout only at LOGGER_VERBOSE and above.
        def debug(msg)
          return unless @level >= LOGGER_VERBOSE
          @stdout.puts(msg)
        end

        # Written to stdout at LOGGER_NORMAL and above.
        def info(msg)
          return unless @level >= LOGGER_NORMAL
          @stdout.puts(msg)
        end

        # Written to stderr at LOGGER_QUIET and above.
        def error(msg)
          return unless @level >= LOGGER_QUIET
          @stderr.puts(msg)
        end
        alias_method :warn, :error

        # Reports the message as an error, then terminates with status 1.
        def fatal(msg)
          error(msg)
          exit 1
        end

        private

        def initialize
          @stdout = STDOUT
          @stderr = STDERR
          @level = LOGGER_NORMAL
        end
      end

    end
  end
end
@@ -0,0 +1,180 @@
1
+ require 'optparse'
2
+
3
+ module Repub
4
+ class App
5
+ module Options
6
+ include Logger
7
+
8
+ attr_reader :options
9
+
10
# Builds @options for a run.
#
# Starts from the built-in defaults, loads the default profile (writing it
# out when load_profile returns an empty result), then parses +args+ with
# OptionParser, which removes the switches it recognises from +args+.
# The last remaining argument becomes options[:url].
#
# The switches -L, -C, -V and -h exit after doing their work. An empty
# argument list, a parse error, a malformed NAME:VALUE argument or a
# missing URL also end the process (via exit or log.fatal).
def parse_options(args)

  # Default options
  @options = {
    :browser      => false,
    :css          => nil,
    :encoding     => nil,
    :fixup        => true,
    :helper       => 'wget',
    :metadata     => {},
    :output_path  => Dir.getwd,
    :profile      => 'default',
    :remove       => [],
    :rx           => [],
    # Work on a copy: -x stores into this hash and must not alter the
    # shared Parser::Selectors defaults.
    :selectors    => Parser::Selectors.dup,
    :url          => nil,
    :verbosity    => Repub::App::Logger::LOGGER_NORMAL,
  }

  # Load default profile
  if load_profile(options[:profile]).empty?
    write_profile(options[:profile])
  end

  # Parse command line
  parser = OptionParser.new do |opts|
    # The banner text is kept flush-left so the emitted text stays unchanged.
    opts.banner = <<-BANNER.gsub(/^ /,'')

Repub is a simple HTML to ePub converter.

Usage: #{App.name} [options] url

General options:
    BANNER

    opts.on("-D", "--downloader NAME ", ['wget', 'httrack'],
      "Which downloader to use to get files (wget or httrack).",
      "Default is #{options[:helper]}."
    ) { |value| options[:helper] = value }

    opts.on("-o", "--output PATH", String,
      "Output path for generated ePub file.",
      "Default is #{options[:output_path]}/<Parsed_Title>.epub"
    ) { |value| options[:output_path] = File.expand_path(value) }

    opts.on("-w", "--write-profile NAME", String,
      "Save given options for later reuse as profile NAME."
    ) { |value| options[:profile] = value; write_profile(value) }

    opts.on("-l", "--load-profile NAME", String,
      "Load options from saved profile NAME."
    ) { |value| options[:profile] = value; load_profile(value) }

    opts.on("-W", "--write-default",
      "Save given options for later reuse as default profile."
    ) { write_profile }

    opts.on("-L", "--list-profiles",
      "List saved profiles."
    ) { list_profiles; exit 1 }

    opts.on("-C", "--cleanup",
      "Clean up download cache."
    ) { Fetcher::Cache.cleanup; exit 1 }

    opts.on("-v", "--verbose",
      "Turn on verbose output."
    ) { options[:verbosity] = Repub::App::Logger::LOGGER_VERBOSE }

    opts.on("-q", "--quiet",
      "Turn off any output except errors."
    ) { options[:verbosity] = Repub::App::Logger::LOGGER_QUIET }

    opts.on("-V", "--version",
      "Show version."
    ) { puts Repub.version; exit 1 }

    opts.on("-h", "--help",
      "Show this help message."
    ) { help opts; exit 1 }

    opts.separator ""
    opts.separator " Parser options:"

    opts.on("-x", "--selector NAME:VALUE", String,
      "Set parser XPath selector NAME to VALUE.",
      "Recognized selectors are: [title toc toc_item toc_section]"
    ) do |value|
      name, value = split_name_value('-x', value)
      options[:selectors][name.to_sym] = value
    end

    opts.on("-m", "--meta NAME:VALUE", String,
      "Set publication information metadata NAME to VALUE.",
      "Valid metadata names are: [creator date description",
      "language publisher relation rights subject title]"
    ) do |value|
      name, value = split_name_value('-m', value)
      options[:metadata][name.to_sym] = value
    end

    opts.on("-F", "--no-fixup",
      "Do not attempt to make document meet XHTML 1.0 Strict.",
      "Default is to try and fix things that are broken. "
    ) { |value| options[:fixup] = false }

    opts.on("-e", "--encoding NAME", String,
      "Set source document encoding. Default is to autodetect."
    ) { |value| options[:encoding] = value }

    opts.separator ""
    opts.separator " Post-processing options:"

    opts.on("-s", "--stylesheet PATH", String,
      "Use custom stylesheet at PATH to add or override existing",
      "CSS references in the source document."
    ) { |value| options[:css] = File.expand_path(value) }

    opts.on("-X", "--remove SELECTOR", String,
      "Remove source element using XPath selector.",
      "Use -X- to ignore stored profile."
    ) { |value| value == '-' ? options[:remove] = [] : options[:remove] << value }

    opts.on("-R", "--rx /PATTERN/REPLACEMENT/", String,
      "Edit source HTML using regular expressions.",
      "Use -R- to ignore stored profile."
    ) { |value| value == '-' ? options[:rx] = [] : options[:rx] << value }

    opts.on("-B", "--browse",
      "After processing, open resulting HTML in default browser."
    ) { |value| options[:browser] = true }

  end

  if args.empty?
    help parser
    exit 1
  end

  begin
    parser.parse! args
  rescue OptionParser::ParseError => ex
    log.fatal "ERROR: #{ex.to_s}. See '#{App.name} --help'."
  end

  options[:url] = args.last
  if options[:url].nil? || options[:url].empty?
    help parser
    log.fatal "ERROR: Please specify an URL."
  end
end

# Splits a "NAME:VALUE" switch argument into [name, value]; reports an
# invalid argument for +switch+ through log.fatal when +arg+ has no NAME: part.
def split_name_value(switch, arg)
  arg.match(/([^:]+):(.*)/)[1, 2]
rescue
  log.fatal "ERROR: invalid argument: #{switch} '#{arg}'. See '#{App.name} --help'."
end
private :split_name_value
169
+
170
# Prints +opts+ (the option summary), a blank line, then the name and the
# dump of the profile currently selected in options, and a closing blank line.
def help(opts)
  profile = options[:profile]
  puts opts, ''
  puts " Current profile (#{profile}):"
  dump_profile(profile)
  puts
end
177
+
178
+ end
179
+ end
180
+ end
@@ -0,0 +1,152 @@
1
+ require 'rubygems'
2
+ require 'nokogiri'
3
+
4
+ module Repub
5
+ class App
6
+ module Parser
7
+
8
# Error raised when the cached assets cannot be parsed (no HTML document,
# or more than one).
class ParserException < RuntimeError
end
9
+
10
# Builds a Parser from the current +options+ and returns the result of
# parsing +cache+ with it.
def parse(cache)
  parser = Parser.new(options)
  parser.parse(cache)
end
13
+
14
# Default XPath selectors for the document title, the table of contents,
# its items and its nested sections
#
Selectors = {
  :title       => '//h1',
  :toc         => '//ul',
  :toc_item    => './li',
  :toc_section => './ul'
}
22
+
23
+ class Parser
24
+ include Logger
25
+
26
+ attr_reader :cache
27
+ attr_reader :uid
28
+ attr_reader :title
29
+ attr_reader :title_html
30
+ attr_reader :toc
31
+
32
# Takes the selector set from options[:selectors], falling back to the
# default Selectors, and remembers options[:fixup].
def initialize(options)
  custom = options[:selectors]
  @selectors = custom ? custom : Selectors
  @fixup = options[:fixup]
end
36
+
37
# Parses the single HTML document held by +cache+ and extracts its title,
# title markup and table of contents. The cache name is used as the uid.
#
# Raises ParserException when the cache holds no HTML document or more
# than one. Returns self.
def parse(cache)
  documents = cache.assets[:documents]
  raise ParserException, "No HTML document found" if documents.empty?
  raise ParserException, "More than one HTML document found, this is not supported (yet)" if
    documents.size > 1

  @cache = cache
  @asset = documents[0]
  log.debug "-- Parsing #{@asset}"
  # Block form so the file handle is closed once the document is parsed
  @doc = File.open(File.join(@cache.path, @asset)) do |file|
    Nokogiri::HTML.parse(file, nil, 'UTF-8')
  end

  @uid = @cache.name
  parse_title
  parse_title_html
  parse_toc

  self
end
55
+
56
+ private
57
+
58
+ UNTITLED = 'Untitled'
59
+
60
# Sets @title from the first element matching the :title selector, or to
# UNTITLED when nothing matches. The text of the element's children is
# joined with spaces and every run of whitespace, line breaks included,
# is collapsed to a single space.
def parse_title
  log.debug "-- Looking for title with #{@selectors[:title]}"
  el = @doc.at(@selectors[:title])
  if el
    if el.children.empty?
      title_text = el.inner_text
    else
      title_text = el.children.map { |c| c.inner_text }.join(' ')
    end
    # Line breaks are whitespace too: collapse them to a space rather
    # than deleting them, which would glue neighbouring words together.
    @title = title_text.gsub(/\s+/, ' ').strip
    log.info "Found title \"#{@title}\""
  else
    @title = UNTITLED
    log.warn "** Could not find document title, using '#{@title}'"
  end
end
76
+
77
# Sets @title_html to the inner HTML of the first element matching the
# :title selector with CR/LF characters removed, or to UNTITLED when
# nothing matches.
def parse_title_html
  selector = @selectors[:title]
  log.debug "-- Looking for html title with #{selector}"
  heading = @doc.at(selector)
  @title_html =
    if heading
      heading.inner_html.gsub(/[\r\n]/, '')
    else
      UNTITLED
    end
end
82
+
83
# Helper container for TOC items
#
# title       - text of the entry
# uri         - document part of the entry's href; the +asset+ passed to
#               the constructor when the href has no document part
# fragment_id - part of the href after '#', or nil
class TocItem < Struct.new(
    :title,
    :uri,
    :fragment_id
  )

  # +uri_with_fragment_id+ is the raw href ("doc.html#anchor", "#anchor",
  # "doc.html" or ""), +subitems+ the nested entries (nil means none).
  def initialize(title, uri_with_fragment_id, subitems, asset)
    self.title = title
    self.uri, self.fragment_id = uri_with_fragment_id.split(/#/)
    # An empty href splits into nothing, leaving uri nil
    self.uri = asset if uri.nil? || uri.empty?
    @subitems = subitems || []
  end

  attr_reader :subitems

  # Link target: "uri#fragment_id", or just the uri when there is no
  # fragment (no dangling '#').
  def src
    return uri.to_s if fragment_id.nil? || fragment_id.empty?
    "#{uri}##{fragment_id}"
  end
end
104
+
105
# Sets @toc from the first element matching the :toc selector (parsed with
# parse_toc_section), or to an empty list when nothing matches.
def parse_toc
  log.debug "-- Looking for TOC with #{@selectors[:toc]}"
  root = @doc.xpath(@selectors[:toc]).first
  if !root
    @toc = []
    log.warn "** Could not find document table of contents"
  else
    @toc = parse_toc_section(root)
    log.info "Found TOC with #{@toc.size} top-level items"
  end
end
116
+
117
# Builds the list of TocItems for one TOC section.
#
# Every element of +section+ matching the :toc_item selector becomes an
# item, provided it is or contains an anchor with an href; other elements
# are skipped. An item containing an element that matches the
# :toc_section selector is parsed recursively for its sub-items.
# Titles have every run of whitespace, line breaks included, collapsed to
# a single space.
def parse_toc_section(section)
  toc = []
  log.debug "-- Looking for TOC items with #{@selectors[:toc_item]}"
  section.xpath(@selectors[:toc_item]).each do |item|
    # Get item's anchor and href
    a = item.name == 'a' ? item : item.at('a')
    next unless a
    href = a[:href]
    next unless href
    # Is this a leaf item or node ?
    subsection = item.xpath(@selectors[:toc_section]).first
    # A node takes its title from the anchor only; a leaf glues the text
    # of all its children
    raw_title = subsection ? a.inner_text : item.children.map { |c| c.inner_text }.join(' ')
    # Collapse line breaks to a space instead of deleting them, so words
    # on neighbouring lines are not glued together
    title = raw_title.gsub(/\s+/, ' ').strip
    log.debug "-- Found item: #{title}"
    # Parse sub-section
    subitems = nil
    if subsection
      log.debug "-- Found section with #{@selectors[:toc_section]}"
      log.debug "-- >"
      subitems = parse_toc_section(subsection)
      log.debug '-- .'
    end
    toc << TocItem.new(title, href, subitems, @asset)
  end
  toc
end
148
+ end
149
+
150
+ end
151
+ end
152
+ end