invisiblellama-repub 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ pkg
2
+ tmp
3
+ .eprj
4
+ *.epub
data/History.txt ADDED
@@ -0,0 +1,3 @@
1
+ == 0.1 / 2009-06-26
2
+
3
+ * Initial release
data/README.txt ADDED
@@ -0,0 +1,95 @@
1
+ == DESCRIPTION:
2
+
3
+ RePub is a simple HTML to ePub converter.
4
+
5
+ == FEATURES/PROBLEMS:
6
+
7
+ A few samples to get started (TODO: write a real description):
8
+
9
+ * Project Gutenberg's THE ADVENTURES OF SHERLOCK HOLMES
10
+ repub -x 'title://div.book//h1' -x 'toc:body//table' -x 'toc_item://tr' \
11
+ -X 'body/pre,body//hr,body/h1,body/h2' \
12
+ http://www.gutenberg.org/dirs/etext99/advsh12h.htm
13
+
14
+ * Project Gutenberg's ALICE'S ADVENTURES IN WONDERLAND
15
+ repub -x 'title:body/h1' -x 'toc:body//table' -x 'toc_item://tr' \
16
+ -X 'body/pre,body//hr,body/h4' \
17
+ http://www.gutenberg.org/files/11/11-h/11-h.htm
18
+
19
+ * The Gelug-Kagyu Tradition of Mahamudra from Berzin Archives
20
+ repub http://www.berzinarchives.com/web/x/prn/p.html_680632258.html
21
+
22
+ * Git User's Manual
23
+ repub -x 'title://h1' -x 'toc://div.toc/dl' -x 'toc_item:/dt' \
24
+ http://www.kernel.org/pub/software/scm/git/docs/user-manual.html
25
+
26
+ == SYNOPSIS:
27
+
28
+ Usage: repub [options] url
29
+
30
+ General options:
31
+ -D, --downloader NAME Which downloader to use to get files (wget or httrack).
32
+ Default is wget.
33
+ -o, --output PATH Output path for generated ePub file.
34
+ Default is <current directory>/<Parsed_Title>.epub
35
+ -w, --write-profile NAME Save given options for later reuse as profile NAME.
36
+ -l, --load-profile NAME Load options from saved profile NAME.
37
+ -W, --write-default Save given options for later reuse as default profile.
38
+ -L, --list-profiles List saved profiles.
39
+ -C, --cleanup Clean up download cache.
40
+ -v, --verbose Turn on verbose output.
41
+ -q, --quiet Turn off any output except errors.
42
+ -V, --version Show version.
43
+ -h, --help Show this help message.
44
+
45
+ Parser options:
46
+ -x, --selector NAME:VALUE Set parser XPath or CSS selector NAME to VALUE.
47
+ Recognized selectors are: [title toc toc_item toc_section]
48
+ -m, --meta NAME:VALUE Set publication information metadata NAME to VALUE.
49
+ Valid metadata names are: [creator date description
50
+ language publisher relation rights subject title]
51
+ -F, --no-fixup Do not attempt to make document meet XHTML 1.0 Strict.
52
+ Default is to try and fix things that are broken.
53
+ -e, --encoding NAME Set source document encoding. Default is to autodetect.
54
+
55
+ Post-processing options:
56
+ -s, --stylesheet PATH Use custom stylesheet at PATH to add or override existing
57
+ CSS references in the source document.
58
+ -X, --remove SELECTOR Remove source element using XPath or CSS selector.
59
+ Use -X- to ignore stored profile.
60
+ -R, --rx /PATTERN/REPLACEMENT/ Edit source HTML using regular expressions.
61
+ Use -R- to ignore stored profile.
62
+ -B, --browse After processing, open resulting HTML in default browser.
63
+
64
+ == REQUIREMENTS:
65
+
66
+ wget or httrack
67
+ zip (Info-ZIP)
68
+
69
+ == INSTALL:
70
+
71
+ gem install repub
72
+
73
+ == LICENSE:
74
+
75
+ The MIT License
76
+
77
+ Copyright (c) 2009 Invisible Llama
78
+
79
+ Permission is hereby granted, free of charge, to any person obtaining a copy
80
+ of this software and associated documentation files (the "Software"), to deal
81
+ in the Software without restriction, including without limitation the rights
82
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
83
+ copies of the Software, and to permit persons to whom the Software is
84
+ furnished to do so, subject to the following conditions:
85
+
86
+ The above copyright notice and this permission notice shall be included in
87
+ all copies or substantial portions of the Software.
88
+
89
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
90
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
91
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
92
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
93
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
94
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
95
+ THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,30 @@
1
# Load the Bones project helpers; fall back to the vendored task setup
# when the gem is not installed.
begin
  require 'bones'
  Bones.setup
rescue LoadError
  begin
    load 'tasks/setup.rb'
  rescue LoadError
    raise '### please install the "bones" gem ###'
  end
end

ensure_in_path 'lib'
require 'repub'

task :default => 'test:run'

# Project description
PROJ.name = 'repub'
PROJ.authors = 'Dmitri Goutnik'
PROJ.email = 'dg@invisiblellama.net'
PROJ.url = 'http://github.com/invisiblellama/repub/tree/master'
PROJ.version = Repub::VERSION
PROJ.rubyforge.name = 'repub'

# Patterns for files that are kept out of the project
PROJ.exclude = ['tmp/', '\.git/', '\.DS_Store', '.*\.tmproj', '^pkg/']

PROJ.spec.opts << '--color'

# Runtime dependencies, declared in the same order as before
%w[builder hpricot chardet launchy].each do |gem_name|
  depend_on gem_name
end
data/TODO.txt ADDED
@@ -0,0 +1,2 @@
1
+ √ add support for rx cleaning/modifying source doc
2
+ √ make -q/-v actually do something
data/bin/repub ADDED
@@ -0,0 +1,24 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.expand_path(
4
+ File.join(File.dirname(__FILE__), %w[.. lib repub]))
5
+
6
+ require 'repub/app'
7
+
8
+ # THE ADVENTURES OF SHERLOCK HOLMES
9
+ # repub -x 'title:body/h1' -x 'toc:body//table' -x 'toc_item://tr' -X 'body/pre,body//hr,body/h1,body/h2' http://www.gutenberg.org/dirs/etext99/advsh12h.htm
10
+ #
11
+ # ALICE'S ADVENTURES IN WONDERLAND
12
+ # repub -x 'title:body/h1' -x 'toc:body//table' -x 'toc_item://tr' -X 'body/pre,body//hr,body/h4' http://www.gutenberg.org/files/11/11-h/11-h.htm
13
+ #
14
+ # The Gelug-Kagyu Tradition of Mahamudra
15
+ # http://www.berzinarchives.com/web/x/prn/p.html_680632258.html
16
+ #
17
+ # Брюс Стерлинг. Схизматрица
18
+ # repub -x 'title://h2' -x 'toc:table' -x 'toc_item://a' -X 'div,table,//hr' http://lib.ru/STERLINGB/shizmatrica.txt_with-big-pictures.html
19
+ #
20
+ # Git User's Manual
21
+ # repub -x 'title://h1' -x 'toc://div.toc/dl' -x 'toc_item:/dt' http://www.kernel.org/pub/software/scm/git/docs/user-manual.html
22
+
23
+
24
+ Repub::App.instance.run ARGV
data/lib/repub.rb ADDED
@@ -0,0 +1,46 @@
1
module Repub

  # :stopdoc:
  VERSION = '0.2.1'
  LIBPATH = File.expand_path(File.dirname(__FILE__)) + File::SEPARATOR
  PATH = File.dirname(LIBPATH) + File::SEPARATOR
  # :startdoc:

  # Returns the version string for the library.
  #
  def self.version
    VERSION
  end

  # Returns the library path for the module. If any arguments are given,
  # they will be joined to the end of the library path using
  # <tt>File.join</tt>.
  #
  def self.libpath( *args )
    args.empty? ? LIBPATH : File.join(LIBPATH, args.flatten)
  end

  # Returns the path for the module. If any arguments are given,
  # they will be joined to the end of the path using
  # <tt>File.join</tt>.
  #
  def self.path( *args )
    args.empty? ? PATH : File.join(PATH, args.flatten)
  end

  # Utility method used to require all files ending in .rb that lie in the
  # directory below this file that has the same name as the filename passed
  # in. Optionally, a specific _directory_ name can be passed in such that
  # the _filename_ does not have to be equivalent to the directory.
  #
  # Files are required in sorted order so the load order does not depend on
  # the order in which the file system happens to list them. Returns the
  # list of files that were found.
  #
  def self.require_all_libs_relative_to( fname, dir = nil )
    dir ||= File.basename(fname, '.*')
    search_me = File.expand_path(
        File.join(File.dirname(fname), dir, '**', '*.rb'))

    Dir.glob(search_me).sort.each {|rb| require rb}
  end

end
45
+
46
+ $:.unshift Repub.libpath
data/lib/repub/app.rb ADDED
@@ -0,0 +1,42 @@
1
+ require 'singleton'
2
+ require 'rubygems'
3
+ require 'launchy'
4
+ require 'repub/app/utility'
5
+ require 'repub/app/options'
6
+ require 'repub/app/profile'
7
+ require 'repub/app/logger'
8
+ require 'repub/app/fetcher'
9
+ require 'repub/app/parser'
10
+ require 'repub/app/builder'
11
+
12
module Repub
  # Command-line application object. A single instance drives the whole
  # conversion: parse options, fetch, parse, build.
  class App
    include Singleton

    # Mix-in actual functionality
    include Options, Profile, Fetcher, Parser, Builder, Logger

    # Name of the running program: the basename of the script being executed.
    def self.name
      File.basename($PROGRAM_NAME)
    end

    # Directory under the user's home where the application keeps its data.
    def self.data_path
      home = File.expand_path('~')
      File.join(home, '.repub')
    end

    # Runs one conversion for the given command-line arguments.
    # RuntimeError raised by any step is reported through the logger
    # instead of propagating to the caller.
    def run(args)
      parse_options(args)

      log.level = options[:verbosity]
      log.info "Making ePub from #{options[:url]}"

      fetched = fetch
      parsed = parse(fetched)
      res = build(parsed)
      log.info "Saved #{res.output_path}"

      Launchy::Browser.run(res.asset_path) if options[:browser]
    rescue RuntimeError => ex
      log.fatal "** ERROR: #{ex}"
    end

  end
end
@@ -0,0 +1,200 @@
1
require 'fileutils'
require 'shellwords'
require 'tmpdir'
require 'repub/epub'
4
+
5
+ module Repub
6
+ class App
7
+ module Builder
8
+
9
+ class BuilderException < RuntimeError; end
10
+
11
# Builds an ePub from the parsed document by handing it to a new Builder
# configured with the current options. Returns whatever Builder#build
# returns.
def build(parser)
  builder = Builder.new(options)
  builder.build(parser)
end
14
+
15
+ class Builder
16
+ include Epub, Logger
17
+
18
+ attr_reader :output_path
19
+ attr_reader :asset_path
20
+
21
# Creates a builder for the given option hash. The hash is kept as-is in
# @options, which the build and post-processing methods of this class read.
+ def initialize(options)
22
+ @options = options
23
+ end
24
+
25
# Builds the ePub file for an already parsed document.
#
# parser - the parse result. This method reads its +uid+ and +title+; the
#          steps it calls also read +cache+ and +toc+.
#
# The archive is assembled inside a temporary directory, which is removed
# afterwards unless the :browser option is set. Returns self, so callers
# can read +output_path+ and +asset_path+.
def build(parser)
  @parser = parser

  # Initialize content.opf; default title is the parsed one
  @content = Content.new(@parser.uid)
  @content.metadata.title = @parser.title
  apply_metadata_overrides

  # Initialize toc.ncx; TOC title is the same as in content.opf
  @toc = Toc.new(@parser.uid)
  @toc.title = @content.metadata.title

  # Setup output filename and path
  @output_path = resolve_output_path(@content.metadata.title)
  log.debug "-- Setting output path to #{@output_path}"

  # Build EPUB
  tmpdir = Dir.mktmpdir(App::name)
  begin
    FileUtils.chdir(tmpdir) do
      copy_and_process_assets
      write_meta_inf
      write_mime_type
      write_content
      write_toc
      write_epub
    end
  ensure
    # Keep tmp folder if we're going to open the processed doc in a browser
    FileUtils.remove_entry_secure(tmpdir) unless @options[:browser]
  end
  self
end

# Overrides metadata values with the ones specified in options.
def apply_metadata_overrides
  overrides = @options[:metadata]
  return unless overrides

  @content.metadata.members.each do |m|
    m = m.to_sym
    next if m == :identifier    # do not allow to override uid
    next unless overrides[m]
    @content.metadata[m] = overrides[m]
    log.debug "-- Setting metadata #{m} to \"#{@content.metadata[m]}\""
  end
end

# Works out the full path of the ePub file to write. When the configured
# output path is an existing directory the file is named after the title,
# with whitespace and path separators replaced by underscores so the title
# cannot point into another directory. The .epub extension is appended
# only when the path does not already end with it.
def resolve_output_path(title)
  path = File.expand_path(@options[:output_path].if_blank('.'))
  path = File.join(path, title.gsub(%r{[\s/\\]}, '_')) if File.directory?(path)
  path =~ /\.epub\z/i ? path : path + '.epub'
end

private :apply_metadata_overrides, :resolve_output_path
74
+
75
+ private
76
+
77
+ MetaInf = 'META-INF'
78
+
79
# Applies text-level fixes to a copied document and rewrites it in place:
# runs every regular expression substitution from the :rx option, then
# prepends an XHTML 1.0 Transitional doctype if the document has none.
#
# asset - path of the file to fix.
#
# Raises BuilderException when an rx specification cannot be parsed.
def postprocess_file(asset)
  source = IO.read(asset)
  # Do rx substitutions
  (@options[:rx] || []).each do |rx|
    pattern, replacement = parse_rx(rx)
    log.info "Replacing pattern #{pattern.inspect} with \"#{replacement}\""
    source.gsub!(pattern, replacement)
  end
  # Add doctype if missing (the declaration is matched case-insensitively,
  # so a lowercase "<!doctype html>" is not given a second one)
  if source !~ /<!DOCTYPE/i
    log.debug "-- Adding missing doctype"
    source = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + source
  end
  # Overwrite asset with fixed version
  File.open(asset, 'w') do |f|
    f.write(source)
  end
end

# Splits an rx specification of the form /PATTERN/REPLACEMENT/ into a
# Regexp and a replacement string. The first character of the stripped
# specification is the delimiter; a delimiter preceded by a backslash is
# taken literally. The delimiter is always treated as plain text, so
# characters such as "|" or "." can be used. The given string is not
# modified.
#
# Raises BuilderException when the specification is empty, has more than
# two parts, or the pattern is not a valid regular expression.
def parse_rx(rx)
  spec = rx.strip
  raise BuilderException, "Invalid regular expression" if spec.empty?

  delimiter = spec[0, 1]
  # Stand-in for escaped delimiters while splitting; block forms of gsub
  # are used so backslashes are never read as replacement escapes.
  placeholder = "\0"
  parts = spec.gsub("\\#{delimiter}") { placeholder }.split(delimiter)
  parts = parts.reject { |part| part.empty? }
  parts = parts.map { |part| part.gsub(placeholder) { delimiter } }
  raise BuilderException, "Invalid regular expression" if parts.empty? || parts.size > 2

  [Regexp.new(parts[0]), parts[1] || '']
rescue RegexpError => ex
  raise BuilderException, "Invalid regular expression: #{ex.message}"
end
105
+
106
# Applies document-level fixes to a copied document and rewrites it in
# place: parses it with Hpricot, points stylesheet links at the custom
# stylesheet when the :css option is set, and removes every element
# matching a selector from the :remove option.
#
# asset - path of the file to fix.
def postprocess_doc(asset)
  # Do Hpricot magic if fixup is ON. The file is opened in block form so
  # the handle is closed before the same file is rewritten below.
  doc = File.open(asset) do |f|
    Hpricot(f, :xhtml_strict => @options[:fixup])
  end
  # Substitute custom stylesheet
  if (@options[:css] && !@options[:css].empty?)
    doc.search('//link[@rel="stylesheet"]') do |link|
      link[:href] = File.basename(@options[:css])
      log.debug "-- Replacing CSS refs with #{link[:href]}"
    end
  end
  # Remove elements
  if @options[:remove] && !@options[:remove].empty?
    @options[:remove].each do |selector|
      log.info "Removing element(s) matching selector \"#{selector}\""
      doc.search(selector).remove
    end
  end
  # Overwrite asset with fixed version
  File.open(asset, 'w') do |f|
    f << doc.to_html
  end
end
128
+
129
# Copies every asset of the parsed document from the download cache into
# the current directory and registers it with content.opf: documents
# first (post-processing each one), then stylesheets, then images.
def copy_and_process_assets
  copy_documents
  copy_stylesheets
  copy_images
end

# Copies, post-processes and registers each HTML document. @asset_path
# ends up holding the absolute path of the last document processed.
def copy_documents
  @parser.cache.assets[:documents].each do |document|
    log.debug "-- Processing document #{document}"
    # Copy asset from cache
    cached_copy = File.join(@parser.cache.path, document)
    FileUtils.cp(cached_copy, '.')
    # Do post-processing
    postprocess_file(document)
    postprocess_doc(document)
    @content.add_document(document)
    @asset_path = File.expand_path(document)
  end
end

# Registers the custom stylesheet from the :css option when one is given;
# otherwise copies and registers the stylesheets found in the cache.
def copy_stylesheets
  custom_css = @options[:css]
  unless custom_css.nil? || custom_css.empty?
    # Copy custom css
    log.debug "-- Using custom stylesheet #{custom_css}"
    FileUtils.cp(custom_css, '.')
    return @content.add_stylesheet(File.basename(custom_css))
  end

  # No custom css, copy one from assets
  @parser.cache.assets[:stylesheets].each do |stylesheet|
    log.debug "-- Copying stylesheet #{stylesheet}"
    FileUtils.cp(File.join(@parser.cache.path, stylesheet), '.')
    @content.add_stylesheet(stylesheet)
  end
end

# Copies and registers each image found in the cache.
def copy_images
  @parser.cache.assets[:images].each do |picture|
    log.debug "-- Copying image #{picture}"
    FileUtils.cp(File.join(@parser.cache.path, picture), '.')
    @content.add_image(picture)
  end
end
162
+
163
# Creates the META-INF directory below the current directory and saves
# the ePub container description inside it.
def write_meta_inf
  FileUtils.mkdir_p(MetaInf)
  FileUtils.cd(MetaInf) { Epub::Container.new.save }
end
169
+
170
# Writes the 'mimetype' file, holding the ePub media type without a
# trailing newline, into the current directory.
def write_mime_type
  File.open('mimetype', 'w') { |out| out << 'application/epub+zip' }
end
175
+
176
# Saves the content.opf model held in @content by calling its save method.
# Returns whatever that call returns.
+ def write_content
177
+ @content.save
178
+ end
179
+
180
# Fills the navigation map of toc.ncx from the parsed table of contents
# and saves it. Returns whatever the save call returns.
def write_toc
  nav_root = @toc.nav_map
  parsed_entries = @parser.toc
  add_nav_points(nav_root, parsed_entries)
  @toc.save
end
184
+
185
# Adds one navigation point per table-of-contents entry to nav_collection,
# then recurses into the entry's subitems (when it has any) using the
# navigation point just created as the parent. Returns toc.
def add_nav_points(nav_collection, toc)
  toc.each do |entry|
    parent = nav_collection.add_nav_point(entry.title, entry.src)
    children = entry.subitems
    add_nav_points(parent, children) if children
  end
end
191
+
192
# Packs the current directory into the ePub archive at @output_path using
# the zip command: 'mimetype' is added first in a call of its own, then
# the remaining files are added recursively.
#
# The output path is shell-escaped because it can contain text taken from
# the downloaded document's title.
#
# NOTE(review): the '-xi mimetype' arguments are passed on exactly as
# before; presumably they keep mimetype out of the second pass - confirm
# against the zip manual.
#
# Raises BuilderException when zip cannot be run or reports a failure.
def write_epub
  archive = Shellwords.escape(@output_path)
  run_zip("zip -X9 #{archive} mimetype")
  run_zip("zip -Xr9D #{archive} * -xi mimetype")
end

# Runs one zip command line through the shell and returns its output.
# Raises BuilderException when the command cannot be started or exits
# with a non-zero status, so a failed archive is never reported as saved.
def run_zip(command)
  output = `#{command}`
  status = $?
  return output if status && status.success?

  code = status ? status.exitstatus : 'unknown'
  raise BuilderException, "zip failed (exit status #{code}) while writing #{@output_path}"
rescue SystemCallError => ex
  raise BuilderException, "Unable to run zip: #{ex.message}"
end
196
+ end
197
+
198
+ end
199
+ end
200
+ end