repub 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt ADDED
@@ -0,0 +1,9 @@
1
+ == 0.2.1 / 2009-06-26
2
+
3
+ * Initial release
4
+
5
+ == 0.3.0 / 2009-06-28
6
+
7
+ * Switched to Nokogiri for HTML parsing
8
+ * Better parsing for hierarchical TOCs
9
+ * Many bug fixes
data/README.txt ADDED
@@ -0,0 +1,106 @@
1
+ == DESCRIPTION:
2
+
3
+ Simple HTML to ePub converter.
4
+
5
+ == FEATURES/PROBLEMS:
6
+
7
+ A few samples to get started:
8
+
9
+ * Git User's Manual
10
+
11
+ repub -x 'title://h1' -x 'toc://div[@class="toc"]/dl' -x 'toc_item:dt' -x 'toc_section:following-sibling::*[1]/dl' \
12
+ http://www.kernel.org/pub/software/scm/git/docs/user-manual.html
13
+
14
+ * Project Gutenberg's THE ADVENTURES OF SHERLOCK HOLMES
15
+
16
+ repub -x 'title:div[@class="book"]//h1' -x 'toc://table' -x 'toc_item://tr' \
17
+ -X '//pre' -X '//hr' -X '//body/h1' -X '//body/h2' \
18
+ http://www.gutenberg.org/dirs/etext99/advsh12h.htm
19
+
20
+ * Project Gutenberg's ALICE'S ADVENTURES IN WONDERLAND
21
+
22
+ repub -x 'title:body/h1' -x 'toc://table' -x 'toc_item://tr' \
23
+ -X '//pre' -X '//hr' -X '//body/h4' \
24
+ http://www.gutenberg.org/files/11/11-h/11-h.htm
25
+
26
+ * The Gelug-Kagyu Tradition of Mahamudra from Berzin Archives
27
+
28
+ repub http://www.berzinarchives.com/web/x/prn/p.html_680632258.html
29
+
30
+ == SYNOPSIS:
31
+
32
+ Usage: repub [options] url
33
+
34
+ General options:
35
+ -D, --downloader NAME Which downloader to use to get files (wget or httrack).
36
+ Default is wget.
37
+ -o, --output PATH Output path for generated ePub file.
38
+ Default is <current directory>/<Parsed_Title>.epub
39
+ -w, --write-profile NAME Save given options for later reuse as profile NAME.
40
+ -l, --load-profile NAME Load options from saved profile NAME.
41
+ -W, --write-default Save given options for later reuse as default profile.
42
+ -L, --list-profiles List saved profiles.
43
+ -C, --cleanup Clean up download cache.
44
+ -v, --verbose Turn on verbose output.
45
+ -q, --quiet Turn off any output except errors.
46
+ -V, --version Show version.
47
+ -h, --help Show this help message.
48
+
49
+ Parser options:
50
+ -x, --selector NAME:VALUE Set parser XPath selector NAME to VALUE.
51
+ Recognized selectors are: [title toc toc_item toc_section]
52
+ -m, --meta NAME:VALUE Set publication information metadata NAME to VALUE.
53
+ Valid metadata names are: [creator date description
54
+ language publisher relation rights subject title]
55
+ -F, --no-fixup Do not attempt to make document meet XHTML 1.0 Strict.
56
+ Default is to try and fix things that are broken.
57
+ -e, --encoding NAME Set source document encoding. Default is to autodetect.
58
+
59
+ Post-processing options:
60
+ -s, --stylesheet PATH Use custom stylesheet at PATH to add or override existing
61
+ CSS references in the source document.
62
+ -X, --remove SELECTOR Remove source element using XPath selector.
63
+ Use -X- to ignore stored profile.
64
+ -R, --rx /PATTERN/REPLACEMENT/ Edit source HTML using regular expressions.
65
+ Use -R- to ignore stored profile.
66
+ -B, --browse After processing, open resulting HTML in default browser.
67
+
68
+ == DEPENDENCIES:
69
+
70
+ * Builder (https://rubyforge.org/projects/builder/)
71
+ * Nokogiri (http://nokogiri.rubyforge.org/nokogiri/)
72
+ * rchardet (https://rubyforge.org/projects/rchardet/)
73
+ * launchy (http://copiousfreetime.rubyforge.org/launchy/)
74
+
75
+ * wget or httrack
76
+ * zip (Info-ZIP)
77
+
78
+ == INSTALL:
79
+
80
+ gem install repub
81
+
82
+ == LICENSE:
83
+
84
+ (The MIT License)
85
+
86
+ Copyright (c) 2009 Invisible Llama <dg@invisiblellama.net>
87
+
88
+ Permission is hereby granted, free of charge, to any person obtaining a copy
89
+ of this software and associated documentation files (the "Software"), to deal
90
+ in the Software without restriction, including without limitation the rights
91
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
92
+ copies of the Software, and to permit persons to whom the Software is
93
+ furnished to do so, subject to the following conditions:
94
+
95
+ The above copyright notice and this permission notice shall be included in
96
+ all copies or substantial portions of the Software.
97
+
98
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
99
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
100
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
101
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
102
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
103
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
104
+ THE SOFTWARE.
105
+
106
+ ==
data/Rakefile ADDED
@@ -0,0 +1,30 @@
1
# Build configuration for the repub gem, expressed with the "bones" task DSL.

# Prefer the installed bones gem; otherwise fall back to a local task setup
# file and, failing that, tell the user what to install.
begin
  require 'bones'
  Bones.setup
rescue LoadError
  begin
    load 'tasks/setup.rb'
  rescue LoadError
    # A bare string raise produces a RuntimeError
    raise '### please install the "bones" gem ###'
  end
end

# Make lib/ loadable so the version constant can be read from the library
ensure_in_path 'lib'
require 'repub'

task :default => 'test:run'

# Project description
PROJ.name = 'repub'
PROJ.authors = 'Dmitri Goutnik'
PROJ.email = 'dg@invisiblellama.net'
PROJ.url = 'http://github.com/invisiblellama/repub/tree/master'
PROJ.version = Repub::VERSION
PROJ.rubyforge.name = 'repub'
PROJ.exclude = %w[tmp/ \.git \.DS_Store .*\.tmproj .*\.epub ^pkg/]

PROJ.spec.opts << '--color'

# Runtime gem dependencies, declared in the same order as before
%w[nokogiri builder chardet launchy].each do |gem_name|
  depend_on gem_name
end
data/SAMPLES.txt ADDED
@@ -0,0 +1,23 @@
1
+ * THE ADVENTURES OF SHERLOCK HOLMES
2
+
3
+ repub -x 'title:div[@class="book"]//h1' -x 'toc://table' -x 'toc_item://tr' -X '//pre' -X '//hr' -X '//body/h1' -X '//body/h2' http://www.gutenberg.org/dirs/etext99/advsh12h.htm
4
+
5
+ * ALICE'S ADVENTURES IN WONDERLAND
6
+
7
+ repub -x 'title:body/h1' -x 'toc://table' -x 'toc_item://tr' -X '//pre' -X '//hr' -X '//body/h4' http://www.gutenberg.org/files/11/11-h/11-h.htm
8
+
9
+ * The Gelug-Kagyu Tradition of Mahamudra
10
+
11
+ repub http://www.berzinarchives.com/web/x/prn/p.html_680632258.html
12
+
13
+ * Брюс Стерлинг. Схизматрица
14
+
15
+ repub -x 'title://h2' -x 'toc://table' -x 'toc_item://a' -X 'div' -X 'table' -X '//hr' http://lib.ru/STERLINGB/shizmatrica.txt_with-big-pictures.html
16
+
17
+ * Айзек Азимов. Космические течения
18
+
19
+ repub -x 'title://h2' -x 'toc://table' -x 'toc_item://a' -X 'div' -X 'table' -X '//hr' http://lib.ru/FOUNDATION/currspac.txt_with-big-pictures.html
20
+
21
+ * Git User's Manual
22
+
23
+ repub -x 'title://h1' -x 'toc://div[@class="toc"]/dl' -x 'toc_item:dt' -x 'toc_section:following-sibling::*[1]/dl' http://www.kernel.org/pub/software/scm/git/docs/user-manual.html
data/TODO ADDED
@@ -0,0 +1,3 @@
1
+ √ add support for rx cleaning/modifying source doc
2
+ √ make -q/-v actually do something
3
+ more parser tokens: author(s) etc
data/bin/repub ADDED
@@ -0,0 +1,8 @@
1
#!/usr/bin/env ruby
# Command-line entry point: loads the library that sits next to this script
# and hands the command-line arguments to the application singleton.

# Turn warnings on here instead of passing -w on the shebang line. Some
# kernels forward everything after the interpreter path as ONE argument, so
# "env ruby -w" makes env look for a program literally named "ruby -w".
$VERBOSE = true

require File.expand_path(
    File.join(File.dirname(__FILE__), %w[.. lib repub]))

require 'repub/app'

Repub::App.instance.run ARGV
data/lib/repub.rb ADDED
@@ -0,0 +1,46 @@
1
module Repub

  # :stopdoc:
  VERSION = '0.3.0'
  LIBPATH = File.expand_path(File.dirname(__FILE__)) + File::SEPARATOR
  PATH = File.dirname(LIBPATH) + File::SEPARATOR
  # :startdoc:

  # Returns the version string for the library.
  #
  def self.version
    VERSION
  end

  # Returns the library path for the module. If any arguments are given,
  # they will be joined to the end of the library path using
  # <tt>File.join</tt>.
  #
  def self.libpath( *args )
    args.empty? ? LIBPATH : File.join(LIBPATH, args.flatten)
  end

  # Returns the path for the module (the parent folder of the library path).
  # If any arguments are given, they will be joined to the end of the path
  # using <tt>File.join</tt>.
  #
  def self.path( *args )
    args.empty? ? PATH : File.join(PATH, args.flatten)
  end

  # Utility method used to require all files ending in .rb that lie in the
  # directory below this file that has the same name as the filename passed
  # in. Optionally, a specific _directory_ name can be passed in such that
  # the _filename_ does not have to be equivalent to the directory.
  #
  # Files are required in sorted path order so that the load order does not
  # depend on how the file system happens to enumerate them. Returns the
  # array of paths that were passed to +require+.
  #
  def self.require_all_libs_relative_to( fname, dir = nil )
    dir ||= File.basename(fname, '.*')
    search_me = File.expand_path(
        File.join(File.dirname(fname), dir, '**', '*.rb'))

    # No debug printing here: loading the library must not write to stdout.
    Dir.glob(search_me).sort.each { |rb| require rb }
  end

end

# Put the library folder first on the load path so 'repub/...' requires resolve
$:.unshift Repub.libpath
data/lib/repub/app.rb ADDED
@@ -0,0 +1,42 @@
1
+ require 'singleton'
2
+ require 'rubygems'
3
+ require 'launchy'
4
+ require 'repub/app/utility'
5
+ require 'repub/app/logger'
6
+ require 'repub/app/options'
7
+ require 'repub/app/profile'
8
+ require 'repub/app/fetcher'
9
+ require 'repub/app/parser'
10
+ require 'repub/app/builder'
11
+
12
module Repub
  # The command-line application object. Singleton is mixed in, so callers
  # obtain it through App.instance.
  class App
    include Singleton

    # Mix-in actual functionality
    include Options, Profile, Fetcher, Parser, Builder, Logger

    class << self
      # Base name of the running program, taken from $0.
      def name
        File.basename($0)
      end

      # The .repub folder inside the current user's home directory.
      def data_path
        File.join(File.expand_path('~'), '.repub')
      end
    end

    # Parses +args+, then runs the fetch -> parse -> build pipeline and logs
    # where the result was saved. When the :browser option is set, the
    # processed document is opened through Launchy afterwards. A RuntimeError
    # raised along the way is logged as fatal instead of propagating.
    def run(args)
      begin
        parse_options(args)

        log.level = options[:verbosity]
        log.info "Making ePub from #{options[:url]}"
        result = build(parse(fetch))
        log.info "Saved #{result.output_path}"

        Launchy::Browser.run(result.asset_path) if options[:browser]
      rescue RuntimeError => error
        log.fatal "** ERROR: #{error}"
      end
    end

  end
end
@@ -0,0 +1,208 @@
1
+ require 'fileutils'
2
+ require 'tmpdir'
3
+ require 'repub/epub'
4
+
5
module Repub
  class App
    # Build step of the application. Including this module gives the host a
    # #build method that delegates to the nested Builder class.
    module Builder

      # Raised when post-processing or packaging fails. Being a RuntimeError,
      # it is covered by any caller that rescues RuntimeError.
      class BuilderException < RuntimeError; end

      # Creates a Builder from the host object's +options+ and runs it on
      # +parser+. Returns the Builder, whose +output_path+ and +asset_path+
      # readers describe the result.
      def build(parser)
        Builder.new(options).build(parser)
      end

      # Assembles the ePub: post-processes the cached documents, writes the
      # package files into a temporary folder and zips that folder up.
      class Builder
        include Epub, Logger

        # Path of the generated .epub file (set by #build)
        attr_reader :output_path
        # Absolute path of the last document processed in the build folder
        attr_reader :asset_path

        # +options+ is the hash of parsed options; this class reads the keys
        # :metadata, :output_path, :browser, :rx, :css, :remove and :fixup.
        def initialize(options)
          @options = options
        end

        # Builds the ePub described by +parser+ and returns self.
        #
        # Raises BuilderException when an rx option is malformed or when the
        # zip command fails.
        def build(parser)
          @parser = parser

          # Initialize content.opf
          @content = Content.new(@parser.uid)
          # Default title is the parsed one
          @content.metadata.title = @parser.title
          # Override metadata values specified in options
          if @options[:metadata]
            @content.metadata.members.each do |m|
              m = m.to_sym
              next if m == :identifier    # do not allow to override uid
              if @options[:metadata][m]
                @content.metadata[m] = @options[:metadata][m]
                log.debug "-- Setting metadata #{m} to \"#{@content.metadata[m]}\""
              end
            end
          end

          # Initialize toc.ncx
          @toc = Toc.new(@parser.uid)
          # TOC title is the same as in content.opf
          @toc.title = @content.metadata.title

          # Setup output filename and path
          @output_path = File.expand_path(@options[:output_path].if_blank('.'))
          if File.directory?(@output_path)
            # Whitespace becomes '_'. So does '/', because a separator inside
            # the title would place the file in a folder that does not exist.
            file_name = @content.metadata.title.gsub(/[\s\/]/, '_')
            @output_path = File.join(@output_path, file_name)
          end
          @output_path = @output_path + '.epub'
          log.debug "-- Setting output path to #{@output_path}"

          # Build EPUB
          tmpdir = Dir.mktmpdir(App::name)
          begin
            FileUtils.chdir(tmpdir) do
              copy_and_process_assets
              write_meta_inf
              write_mime_type
              write_content
              write_toc
              write_epub
            end
          ensure
            # Keep tmp folder if we're going to open processed doc in browser
            FileUtils.remove_entry_secure(tmpdir) unless @options[:browser]
          end
          self
        end

        private

        MetaInf = 'META-INF'

        # Text-level post-processing of the file +asset+: applies every rx
        # option to its contents, prepends a doctype when none is present and
        # writes the result back to the same file.
        def postprocess_file(asset)
          source = IO.read(asset)
          # Do rx substitutions
          Array(@options[:rx]).each do |rx|
            pattern, replacement, delimiter = split_rx(rx)
            # Show the pattern with its delimiters escaped again
            shown = pattern.gsub(delimiter) { "\\#{delimiter}" }
            log.info "Replacing pattern /#{shown}/ with \"#{replacement}\""
            begin
              regexp = Regexp.new(pattern)
            rescue RegexpError => ex
              # Report a bad pattern through this module's own error class
              raise BuilderException, "Invalid regular expression: #{ex.message}"
            end
            source.gsub!(regexp, replacement)
          end
          # Add doctype if missing
          if source !~ /\s*<!DOCTYPE/
            log.debug "-- Adding missing doctype"
            source = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + source
          end
          # Save processed file
          File.open(asset, 'w') do |f|
            f.write(source)
          end
        end

        # Splits an rx option of the form /PATTERN/REPLACEMENT/ and returns
        # [pattern, replacement, delimiter]. The first character of the
        # stripped option is the delimiter; a delimiter preceded by a
        # backslash is taken literally. A missing replacement becomes ''.
        #
        # Raises BuilderException when no pattern is found or when there are
        # more than two parts.
        def split_rx(rx)
          rx = rx.strip   # work on a copy, the stored option stays untouched
          raise BuilderException, "Invalid regular expression" if rx.empty?
          delimiter = rx[0, 1]
          # The delimiter may be a regexp metacharacter such as | or .
          escaped = Regexp.escape(delimiter)
          # Hide escaped delimiters behind a newline while splitting
          parts = rx.gsub(/\\#{escaped}/, "\n").split(/#{escaped}/)
          parts = parts.reject { |e| e.empty? }.map { |e| e.gsub("\n") { delimiter } }
          raise BuilderException, "Invalid regular expression" if parts.empty? || parts.size > 2
          [parts[0], parts[1] || '', delimiter]
        end

        # DOM-level post-processing of the file +asset+: points stylesheet
        # links at the custom CSS (when the :css option is set), removes the
        # elements matched by the :remove selectors and writes the document
        # back as XHTML (with :fixup) or HTML.
        def postprocess_doc(asset)
          # Block form of File.open closes the handle once parsing is done
          doc = File.open(asset) { |f| Nokogiri::HTML.parse(f, nil, 'UTF-8') }
          # Substitute custom CSS
          if @options[:css] && !@options[:css].empty?
            # xpath only returns the matching nodes; iterate them explicitly
            doc.xpath('//link[@rel="stylesheet"]').each do |link|
              link[:href] = File.basename(@options[:css])
              log.debug "-- Replacing CSS refs with #{link[:href]}"
            end
          end
          # Remove elements
          Array(@options[:remove]).each do |selector|
            log.info "Removing elements matching selector \"#{selector}\""
            doc.search(selector).remove
          end
          # Save processed doc
          File.open(asset, 'w') do |f|
            if @options[:fixup]
              # HACK: Nokogiri seems to ignore the fact that xmlns and other attrs already present
              # in html node and adds them anyway. Just remove them here to avoid duplicates.
              doc.root.attributes.each { |name, _value| doc.root.remove_attribute(name) }
              doc.write_xhtml_to(f, :encoding => 'UTF-8')
            else
              doc.write_html_to(f, :encoding => 'UTF-8')
            end
          end
        end

        # Copies documents, stylesheets and images from the parser's cache
        # into the current folder and registers each one with the content
        # manifest. Documents are post-processed after being copied.
        def copy_and_process_assets
          # Copy html
          @parser.cache.assets[:documents].each do |asset|
            log.debug "-- Processing document #{asset}"
            # Copy asset from cache
            FileUtils.cp(File.join(@parser.cache.path, asset), '.')
            # Do post-processing
            postprocess_file(asset)
            postprocess_doc(asset)
            @content.add_document(asset)
            # With several documents this ends up pointing at the last one
            @asset_path = File.expand_path(asset)
          end
          # Copy css
          if @options[:css].nil? || @options[:css].empty?
            # No custom css, copy one from assets
            @parser.cache.assets[:stylesheets].each do |css|
              log.debug "-- Copying stylesheet #{css}"
              FileUtils.cp(File.join(@parser.cache.path, css), '.')
              @content.add_stylesheet(css)
            end
          else
            # Copy custom css
            log.debug "-- Using custom stylesheet #{@options[:css]}"
            FileUtils.cp(@options[:css], '.')
            @content.add_stylesheet(File.basename(@options[:css]))
          end
          # Copy images
          @parser.cache.assets[:images].each do |image|
            log.debug "-- Copying image #{image}"
            FileUtils.cp(File.join(@parser.cache.path, image), '.')
            @content.add_image(image)
          end
        end

        # Creates the META-INF folder and saves the container description
        # inside it.
        def write_meta_inf
          FileUtils.mkdir_p(MetaInf)
          FileUtils.chdir(MetaInf) do
            Epub::Container.new.save
          end
        end

        # Writes the 'mimetype' file holding the ePub media type.
        def write_mime_type
          File.open('mimetype', 'w') do |f|
            f << 'application/epub+zip'
          end
        end

        # Saves the content manifest.
        def write_content
          @content.save
        end

        # Copies the parsed table of contents into the nav map and saves it.
        def write_toc
          add_nav_points(@toc.nav_map, @parser.toc)
          @toc.save
        end

        # Adds one nav point per entry of +toc+ to +nav_collection+, recursing
        # into an entry's subitems when it has any.
        def add_nav_points(nav_collection, toc)
          toc.each do |t|
            nav_point = nav_collection.add_nav_point(t.title, t.src)
            add_nav_points(nav_point, t.subitems) if t.subitems
          end
        end

        # Packs the current folder into the output archive: 'mimetype' is
        # added by a first zip run of its own, everything else by a second.
        #
        # Raises BuilderException when zip cannot be run or reports failure.
        def write_epub
          run_zip('-X9', @output_path, 'mimetype')
          # Expand the folder contents here (as the shell's * did) because
          # the command is no longer passed through a shell.
          entries = Dir.glob('*').sort
          run_zip('-Xr9D', @output_path, *(entries + ['-xi', 'mimetype']))
        end

        # Runs zip quietly with +args+. The arguments go to the program as a
        # list, without a shell, so quotes, spaces or $ in the output path
        # cannot change the command.
        def run_zip(*args)
          return if system('zip', '-q', *args)
          raise BuilderException, "zip failed while writing #{@output_path}"
        end
      end

    end
  end
end