repub 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt ADDED
@@ -0,0 +1,9 @@
1
+ == 0.2.1 / 2009-06-26
2
+
3
+ * Initial release
4
+
5
+ == 0.3.0 / 2009-06-28
6
+
7
+ * Switched to Nokogiri for HTML parsing
8
+ * Better parsing for hierarchical TOCs
9
+ * Many bug fixes
data/README.txt ADDED
@@ -0,0 +1,106 @@
1
+ == DESCRIPTION:
2
+
3
+ Simple HTML to ePub converter.
4
+
5
+ == FEATURES/PROBLEMS:
6
+
7
+ A few samples to get started:
8
+
9
+ * Git User's Manual
10
+
11
+ repub -x 'title://h1' -x 'toc://div[@class="toc"]/dl' -x 'toc_item:dt' -x 'toc_section:following-sibling::*[1]/dl' \
12
+ http://www.kernel.org/pub/software/scm/git/docs/user-manual.html
13
+
14
+ * Project Gutenberg's THE ADVENTURES OF SHERLOCK HOLMES
15
+
16
+ repub -x 'title:div[@class="book"]//h1' -x 'toc://table' -x 'toc_item://tr' \
17
+ -X '//pre' -X '//hr' -X '//body/h1' -X '//body/h2' \
18
+ http://www.gutenberg.org/dirs/etext99/advsh12h.htm
19
+
20
+ * Project Gutenberg's ALICE'S ADVENTURES IN WONDERLAND
21
+
22
+ repub -x 'title:body/h1' -x 'toc://table' -x 'toc_item://tr' \
23
+ -X '//pre' -X '//hr' -X '//body/h4' \
24
+ http://www.gutenberg.org/files/11/11-h/11-h.htm
25
+
26
+ * The Gelug-Kagyu Tradition of Mahamudra from Berzin Archives
27
+
28
+ repub http://www.berzinarchives.com/web/x/prn/p.html_680632258.html
29
+
30
+ == SYNOPSIS:
31
+
32
+ Usage: repub [options] url
33
+
34
+ General options:
35
+ -D, --downloader NAME Which downloader to use to get files (wget or httrack).
36
+ Default is wget.
37
+ -o, --output PATH Output path for generated ePub file.
38
+ Default is <current directory>/<Parsed_Title>.epub
39
+ -w, --write-profile NAME Save given options for later reuse as profile NAME.
40
+ -l, --load-profile NAME Load options from saved profile NAME.
41
+ -W, --write-default Save given options for later reuse as default profile.
42
+ -L, --list-profiles List saved profiles.
43
+ -C, --cleanup Clean up download cache.
44
+ -v, --verbose Turn on verbose output.
45
+ -q, --quiet Turn off any output except errors.
46
+ -V, --version Show version.
47
+ -h, --help Show this help message.
48
+
49
+ Parser options:
50
+ -x, --selector NAME:VALUE Set parser XPath selector NAME to VALUE.
51
+ Recognized selectors are: [title toc toc_item toc_section]
52
+ -m, --meta NAME:VALUE Set publication information metadata NAME to VALUE.
53
+ Valid metadata names are: [creator date description
54
+ language publisher relation rights subject title]
55
+ -F, --no-fixup Do not attempt to make document meet XHTML 1.0 Strict.
56
+ Default is to try and fix things that are broken.
57
+ -e, --encoding NAME Set source document encoding. Default is to autodetect.
58
+
59
+ Post-processing options:
60
+ -s, --stylesheet PATH Use custom stylesheet at PATH to add or override existing
61
+ CSS references in the source document.
62
+ -X, --remove SELECTOR Remove source element using XPath selector.
63
+ Use -X- to ignore stored profile.
64
+ -R, --rx /PATTERN/REPLACEMENT/ Edit source HTML using regular expressions.
65
+ Use -R- to ignore stored profile.
66
+ -B, --browse After processing, open resulting HTML in default browser.
67
+
68
+ == DEPENDENCIES:
69
+
70
+ * Builder (https://rubyforge.org/projects/builder/)
71
+ * Nokogiri (http://nokogiri.rubyforge.org/nokogiri/)
72
+ * rchardet (https://rubyforge.org/projects/rchardet/)
73
+ * launchy (http://copiousfreetime.rubyforge.org/launchy/)
74
+
75
+ * wget or httrack
76
+ * zip (Info-ZIP)
77
+
78
+ == INSTALL:
79
+
80
+ gem install repub
81
+
82
+ == LICENSE:
83
+
84
+ (The MIT License)
85
+
86
+ Copyright (c) 2009 Invisible Llama <dg@invisiblellama.net>
87
+
88
+ Permission is hereby granted, free of charge, to any person obtaining a copy
89
+ of this software and associated documentation files (the "Software"), to deal
90
+ in the Software without restriction, including without limitation the rights
91
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
92
+ copies of the Software, and to permit persons to whom the Software is
93
+ furnished to do so, subject to the following conditions:
94
+
95
+ The above copyright notice and this permission notice shall be included in
96
+ all copies or substantial portions of the Software.
97
+
98
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
99
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
100
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
101
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
102
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
103
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
104
+ THE SOFTWARE.
105
+
106
+ ==
data/Rakefile ADDED
@@ -0,0 +1,30 @@
1
# Build script for the repub gem, driven by the Mr Bones task library.

# Prefer the installed bones gem; otherwise fall back to a local copy of its
# setup tasks, and give up with a readable message when neither is available.
begin
  require 'bones'
  Bones.setup
rescue LoadError
  begin
    load 'tasks/setup.rb'
  rescue LoadError
    raise '### please install the "bones" gem ###'
  end
end

# Repub::VERSION is read below, so the library itself has to be loadable.
ensure_in_path 'lib'
require 'repub'

task(:default => 'test:run')

# Project description
PROJ.name           = 'repub'
PROJ.authors        = 'Dmitri Goutnik'
PROJ.email          = 'dg@invisiblellama.net'
PROJ.url            = 'http://github.com/invisiblellama/repub/tree/master'
PROJ.version        = Repub::VERSION
PROJ.rubyforge.name = 'repub'

# Patterns for files to leave out
PROJ.exclude = [
  'tmp/',
  '\.git',
  '\.DS_Store',
  '.*\.tmproj',
  '.*\.epub',
  '^pkg/'
]

PROJ.spec.opts << '--color'

# Runtime gem dependencies
['nokogiri', 'builder', 'chardet', 'launchy'].each do |gem_name|
  depend_on gem_name
end
data/SAMPLES.txt ADDED
@@ -0,0 +1,23 @@
1
+ * THE ADVENTURES OF SHERLOCK HOLMES
2
+
3
+ repub -x 'title:div[@class="book"]//h1' -x 'toc://table' -x 'toc_item://tr' -X '//pre' -X '//hr' -X '//body/h1' -X '//body/h2' http://www.gutenberg.org/dirs/etext99/advsh12h.htm
4
+
5
+ * ALICE'S ADVENTURES IN WONDERLAND
6
+
7
+ repub -x 'title:body/h1' -x 'toc://table' -x 'toc_item://tr' -X '//pre' -X '//hr' -X '//body/h4' http://www.gutenberg.org/files/11/11-h/11-h.htm
8
+
9
+ * The Gelug-Kagyu Tradition of Mahamudra
10
+
11
+ repub http://www.berzinarchives.com/web/x/prn/p.html_680632258.html
12
+
13
+ * Брюс Стерлинг. Схизматрица
14
+
15
+ repub -x 'title://h2' -x 'toc://table' -x 'toc_item://a' -X 'div' -X 'table' -X '//hr' http://lib.ru/STERLINGB/shizmatrica.txt_with-big-pictures.html
16
+
17
+ * Айзек Азимов. Космические течения
18
+
19
+ repub -x 'title://h2' -x 'toc://table' -x 'toc_item://a' -X 'div' -X 'table' -X '//hr' http://lib.ru/FOUNDATION/currspac.txt_with-big-pictures.html
20
+
21
+ * Git User's Manual
22
+
23
+ repub -x 'title://h1' -x 'toc://div[@class="toc"]/dl' -x 'toc_item:dt' -x 'toc_section:following-sibling::*[1]/dl' http://www.kernel.org/pub/software/scm/git/docs/user-manual.html
data/TODO ADDED
@@ -0,0 +1,3 @@
1
+ √ add support for rx cleaning/modifying source doc
2
+ √ make -q/-v actually do something
3
+ more parser tokens: author(s) etc
data/bin/repub ADDED
@@ -0,0 +1,8 @@
1
#!/usr/bin/env ruby
# Command-line entry point: hands ARGV to the Repub::App singleton.

# Warnings are switched on here instead of with a `-w` flag on the shebang
# line. Many kernels pass everything after the interpreter path to `env` as a
# single argument, so `#!/usr/bin/env ruby -w` makes env look for a program
# literally named "ruby -w" and the script fails to start.
$VERBOSE = true

# Load lib/repub.rb by its location relative to this script, so the command
# works without the library being on the load path beforehand.
require File.expand_path(
  File.join(File.dirname(__FILE__), %w[.. lib repub]))

require 'repub/app'

Repub::App.instance.run ARGV
data/lib/repub.rb ADDED
@@ -0,0 +1,46 @@
1
# Top-level namespace of the repub library. Holds the version constant and
# helpers for locating and loading files relative to the library directory.
module Repub

  # :stopdoc:
  VERSION = '0.3.0'
  # Directory containing this file, with a trailing separator.
  LIBPATH = File.expand_path(File.dirname(__FILE__)) + File::SEPARATOR
  # Parent directory of LIBPATH, with a trailing separator.
  PATH = File.dirname(LIBPATH) + File::SEPARATOR
  # :startdoc:

  # Returns the version string for the library.
  #
  def self.version
    VERSION
  end

  # Returns the library path for the module. If any arguments are given,
  # they will be joined to the end of the library path using
  # <tt>File.join</tt>.
  #
  def self.libpath(*args)
    args.empty? ? LIBPATH : File.join(LIBPATH, args.flatten)
  end

  # Returns the path for the module. If any arguments are given,
  # they will be joined to the end of the path using
  # <tt>File.join</tt>.
  #
  def self.path(*args)
    args.empty? ? PATH : File.join(PATH, args.flatten)
  end

  # Utility method used to require all files ending in .rb that lie in the
  # directory below this file that has the same name as the filename passed
  # in. Optionally, a specific _directory_ name can be passed in such that
  # the _filename_ does not have to be equivalent to the directory.
  #
  # Files are required in sorted order so that load order does not depend on
  # the order in which the file system happens to list them. Nothing is
  # printed while loading.
  #
  def self.require_all_libs_relative_to(fname, dir = nil)
    dir ||= File.basename(fname, '.*')
    search_me = File.expand_path(
      File.join(File.dirname(fname), dir, '**', '*.rb'))

    Dir.glob(search_me).sort.each { |rb| require rb }
  end

end

# Make the library directory available to plain `require 'repub/...'` calls.
$:.unshift Repub.libpath
data/lib/repub/app.rb ADDED
@@ -0,0 +1,42 @@
1
+ require 'singleton'
2
+ require 'rubygems'
3
+ require 'launchy'
4
+ require 'repub/app/utility'
5
+ require 'repub/app/logger'
6
+ require 'repub/app/options'
7
+ require 'repub/app/profile'
8
+ require 'repub/app/fetcher'
9
+ require 'repub/app/parser'
10
+ require 'repub/app/builder'
11
+
12
module Repub
  # The command-line application. A single instance runs the whole pipeline;
  # the individual steps come from the modules mixed in below.
  class App
    include Singleton

    # Mix-in actual functionality
    include Options, Profile, Fetcher, Parser, Builder, Logger

    # Base name of the running program, taken from $0.
    def self.name
      File.basename($PROGRAM_NAME)
    end

    # Per-user data directory: ".repub" inside the home directory.
    def self.data_path
      File.expand_path(File.join('~', '.repub'))
    end

    # Parses +args+, then fetches, parses and builds the ePub, optionally
    # opening the processed document in a browser. A RuntimeError raised along
    # the way is logged as fatal instead of propagating.
    def run(args)
      begin
        parse_options(args)

        log.level = options[:verbosity]
        log.info "Making ePub from #{options[:url]}"
        fetched = fetch
        parsed = parse(fetched)
        result = build(parsed)
        log.info "Saved #{result.output_path}"

        Launchy::Browser.run(result.asset_path) if options[:browser]
      rescue RuntimeError => error
        log.fatal "** ERROR: #{error}"
      end
    end

  end
end
@@ -0,0 +1,208 @@
1
+ require 'fileutils'
2
+ require 'tmpdir'
3
+ require 'repub/epub'
4
+
5
module Repub
  class App
    # Mix-in providing the final pipeline step of App: turning a parser result
    # into an ePub file.
    module Builder

      # Raised when the ePub cannot be built. It is a RuntimeError, which is
      # the class App#run rescues and logs.
      class BuilderException < RuntimeError; end

      # Builds an ePub from +parser+ using the current options and returns the
      # builder, which exposes +output_path+ and +asset_path+.
      def build(parser)
        Builder.new(options).build(parser)
      end

      # Does the actual work: post-processes the cached assets in a temporary
      # directory, writes the ePub metadata files and zips everything up.
      class Builder
        include Epub, Logger

        # Full path of the generated .epub file.
        attr_reader :output_path
        # Full path of the last processed document inside the temp directory.
        attr_reader :asset_path

        def initialize(options)
          @options = options
        end

        # Builds the ePub for +parser+ and returns +self+.
        def build(parser)
          @parser = parser

          # Initialize content.opf
          @content = Content.new(@parser.uid)
          # Default title is the parsed one
          @content.metadata.title = @parser.title
          # Override metadata values specified in options
          if @options[:metadata]
            @content.metadata.members.each do |m|
              m = m.to_sym
              next if m == :identifier    # do not allow to override uid
              if @options[:metadata][m]
                @content.metadata[m] = @options[:metadata][m]
                log.debug "-- Setting metadata #{m} to \"#{@content.metadata[m]}\""
              end
            end
          end

          # Initialize toc.ncx
          @toc = Toc.new(@parser.uid)
          # TOC title is the same as in content.opf
          @toc.title = @content.metadata.title

          # Setup output filename and path
          @output_path = File.expand_path(@options[:output_path].if_blank('.'))
          if File.exist?(@output_path) && File.directory?(@output_path)
            @output_path = File.join(@output_path, @content.metadata.title.gsub(/\s/, '_'))
          end
          @output_path = @output_path + '.epub'
          log.debug "-- Setting output path to #{@output_path}"

          # Build EPUB
          tmpdir = Dir.mktmpdir(App.name)
          begin
            FileUtils.chdir(tmpdir) do
              copy_and_process_assets
              write_meta_inf
              write_mime_type
              write_content
              write_toc
              write_epub
            end
          ensure
            # Keep tmp folder if we're going to open processed doc in browser
            FileUtils.remove_entry_secure(tmpdir) unless @options[:browser]
          end
          self
        end

        private

        MetaInf = 'META-INF'

        # Splits an rx option of the form /PATTERN/REPLACEMENT/ into
        # [pattern, replacement]. The first character of the option is the
        # delimiter; a backslash-escaped delimiter is kept as a literal
        # delimiter character inside the pattern or replacement.
        # The caller's string is not modified.
        # Raises BuilderException when the option has no pattern or has more
        # than two parts.
        def parse_rx(rx)
          rx = rx.strip
          delimiter = rx[0, 1]
          # Escape the delimiter so that characters such as | or . are matched
          # literally instead of being read as regexp syntax.
          quoted = Regexp.escape(delimiter)
          # Park escaped delimiters as newlines while splitting on bare ones.
          parked = rx.gsub(/\\#{quoted}/, "\n")
          parts = parked.split(/#{quoted}/).reject { |e| e.empty? }
          parts = parts.map { |e| e.gsub("\n") { delimiter } }
          raise BuilderException, "Invalid regular expression" if parts.empty? || parts.size > 2
          [parts[0], parts[1] || '']
        end

        # Text-level post-processing of one copied document: applies the rx
        # substitutions from the options and prepends an XHTML doctype when
        # the document has none, then rewrites the file in place.
        def postprocess_file(asset)
          source = IO.read(asset)
          # Do rx substitutions
          if @options[:rx] && !@options[:rx].empty?
            @options[:rx].each do |rx|
              pattern, replacement = parse_rx(rx)
              delimiter = rx.strip[0, 1]
              shown = pattern.gsub(delimiter) { "\\#{delimiter}" }
              log.info "Replacing pattern /#{shown}/ with \"#{replacement}\""
              begin
                regexp = Regexp.new(pattern)
              rescue RegexpError => ex
                # RegexpError is not a RuntimeError; convert it so that it is
                # reported like any other build failure.
                raise BuilderException, "Invalid regular expression: #{ex.message}"
              end
              source.gsub!(regexp, replacement)
            end
          end
          # Add doctype if missing (the doctype keyword is case-insensitive)
          unless source =~ /<!DOCTYPE/i
            log.debug "-- Adding missing doctype"
            source = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + source
          end
          # Save processed file
          File.open(asset, 'w') do |f|
            f.write(source)
          end
        end

        # DOM-level post-processing of one copied document: points stylesheet
        # links at the custom CSS, removes elements matching the remove
        # selectors, then rewrites the file as XHTML (fixup) or HTML.
        def postprocess_doc(asset)
          # Block form closes the file as soon as parsing is done.
          doc = File.open(asset) { |f| Nokogiri::HTML.parse(f, nil, 'UTF-8') }
          # Substitute custom CSS
          if @options[:css] && !@options[:css].empty?
            # xpath only returns the matching nodes; they have to be iterated
            # explicitly for the block to run.
            doc.xpath('//link[@rel="stylesheet"]').each do |link|
              link[:href] = File.basename(@options[:css])
              log.debug "-- Replacing CSS refs with #{link[:href]}"
            end
          end
          # Remove elements
          if @options[:remove] && !@options[:remove].empty?
            @options[:remove].each do |selector|
              log.info "Removing elements matching selector \"#{selector}\""
              doc.search(selector).remove
            end
          end
          # Save processed doc
          File.open(asset, 'w') do |f|
            if @options[:fixup]
              # HACK: Nokogiri seems to ignore the fact that xmlns and other attrs already present
              # in html node and adds them anyway. Just remove them here to avoid duplicates.
              doc.root.attributes.each_key { |name| doc.root.remove_attribute(name) }
              doc.write_xhtml_to(f, :encoding => 'UTF-8')
            else
              doc.write_html_to(f, :encoding => 'UTF-8')
            end
          end
        end

        # Copies documents, stylesheets and images from the parser cache into
        # the current directory and registers each with content.opf.
        # Documents are post-processed after copying.
        def copy_and_process_assets
          # Copy html
          @parser.cache.assets[:documents].each do |asset|
            log.debug "-- Processing document #{asset}"
            # Copy asset from cache
            FileUtils.cp(File.join(@parser.cache.path, asset), '.')
            # Do post-processing
            postprocess_file(asset)
            postprocess_doc(asset)
            @content.add_document(asset)
            @asset_path = File.expand_path(asset)
          end
          # Copy css
          if @options[:css].nil? || @options[:css].empty?
            # No custom css, copy one from assets
            @parser.cache.assets[:stylesheets].each do |css|
              log.debug "-- Copying stylesheet #{css}"
              FileUtils.cp(File.join(@parser.cache.path, css), '.')
              @content.add_stylesheet(css)
            end
          else
            # Copy custom css
            log.debug "-- Using custom stylesheet #{@options[:css]}"
            FileUtils.cp(@options[:css], '.')
            @content.add_stylesheet(File.basename(@options[:css]))
          end
          # Copy images
          @parser.cache.assets[:images].each do |image|
            log.debug "-- Copying image #{image}"
            FileUtils.cp(File.join(@parser.cache.path, image), '.')
            @content.add_image(image)
          end
        end

        # Writes the container description into the META-INF directory.
        def write_meta_inf
          FileUtils.mkdir_p(MetaInf)
          FileUtils.chdir(MetaInf) do
            Epub::Container.new.save
          end
        end

        # Writes the mimetype file (no trailing newline).
        def write_mime_type
          File.open('mimetype', 'w') do |f|
            f << 'application/epub+zip'
          end
        end

        # Writes content.opf.
        def write_content
          @content.save
        end

        # Fills the navigation map from the parsed TOC and writes toc.ncx.
        def write_toc
          add_nav_points(@toc.nav_map, @parser.toc)
          @toc.save
        end

        # Recursively adds one nav point per TOC item, nesting subitems under
        # the nav point of their parent.
        def add_nav_points(nav_collection, toc)
          toc.each do |t|
            nav_point = nav_collection.add_nav_point(t.title, t.src)
            add_nav_points(nav_point, t.subitems) if t.subitems
          end
        end

        # Runs zip quietly with +args+ passed as separate arguments, so no
        # shell ever interprets them. Raises BuilderException when zip cannot
        # be started or exits with an error.
        def run_zip(*args)
          return if system('zip', '-q', *args)
          raise BuilderException, "zip failed while writing #{@output_path}"
        end

        # Packs the current directory into the output file. The mimetype
        # entry is added first and stored uncompressed; everything else
        # follows, compressed, without directory entries.
        def write_epub
          # zip updates an existing archive in place, which would keep stale
          # entries and leave mimetype somewhere other than first.
          FileUtils.rm_f(@output_path)
          run_zip('-X0', @output_path, 'mimetype')
          # The output path is derived from the document title, so it must
          # never pass through a shell; the file list replaces the shell glob.
          entries = Dir['*'].sort - ['mimetype']
          run_zip('-Xr9D', @output_path, *entries)
        end
      end

    end
  end
end