invisiblellama-repub 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/History.txt +3 -0
- data/README.txt +95 -0
- data/Rakefile +30 -0
- data/TODO.txt +2 -0
- data/bin/repub +24 -0
- data/lib/repub.rb +46 -0
- data/lib/repub/app.rb +42 -0
- data/lib/repub/app/builder.rb +200 -0
- data/lib/repub/app/fetcher.rb +162 -0
- data/lib/repub/app/logger.rb +52 -0
- data/lib/repub/app/options.rb +173 -0
- data/lib/repub/app/parser.rb +139 -0
- data/lib/repub/app/profile.rb +91 -0
- data/lib/repub/app/utility.rb +57 -0
- data/lib/repub/epub.rb +3 -0
- data/lib/repub/epub/container.rb +28 -0
- data/lib/repub/epub/content.rb +153 -0
- data/lib/repub/epub/toc.rb +139 -0
- data/lib/repub/mobi/.githidden +0 -0
- data/test/epub/test_container.rb +15 -0
- data/test/epub/test_content.rb +56 -0
- data/test/epub/test_toc.rb +29 -0
- data/test/test_builder.rb +8 -0
- data/test/test_fetcher.rb +36 -0
- data/test/test_logger.rb +76 -0
- data/test/test_parser.rb +32 -0
- metadata +139 -0
data/.gitignore
ADDED
data/History.txt
ADDED
data/README.txt
ADDED
@@ -0,0 +1,95 @@
|
|
1
|
+
== DESCRIPTION:
|
2
|
+
|
3
|
+
RePub is a simple HTML to ePub converter.
|
4
|
+
|
5
|
+
== FEATURES/PROBLEMS:
|
6
|
+
|
7
|
+
A few samples to get started: (TODO real description)
|
8
|
+
|
9
|
+
* Project Gutenberg's THE ADVENTURES OF SHERLOCK HOLMES
|
10
|
+
repub -x 'title://div.book//h1' -x 'toc:body//table' -x 'toc_item://tr' \
|
11
|
+
-X 'body/pre,body//hr,body/h1,body/h2' \
|
12
|
+
http://www.gutenberg.org/dirs/etext99/advsh12h.htm
|
13
|
+
|
14
|
+
* Project Gutenberg's ALICE'S ADVENTURES IN WONDERLAND
|
15
|
+
repub -x 'title:body/h1' -x 'toc:body//table' -x 'toc_item://tr' \
|
16
|
+
-X 'body/pre,body//hr,body/h4' \
|
17
|
+
http://www.gutenberg.org/files/11/11-h/11-h.htm
|
18
|
+
|
19
|
+
* The Gelug-Kagyu Tradition of Mahamudra from Berzin Archives
|
20
|
+
repub http://www.berzinarchives.com/web/x/prn/p.html_680632258.html
|
21
|
+
|
22
|
+
* Git User's Manual
|
23
|
+
repub -x 'title://h1' -x 'toc://div.toc/dl' -x 'toc_item:/dt' \
|
24
|
+
http://www.kernel.org/pub/software/scm/git/docs/user-manual.html
|
25
|
+
|
26
|
+
== SYNOPSIS:
|
27
|
+
|
28
|
+
Usage: repub [options] url
|
29
|
+
|
30
|
+
General options:
|
31
|
+
-D, --downloader NAME Which downloader to use to get files (wget or httrack).
|
32
|
+
Default is wget.
|
33
|
+
-o, --output PATH Output path for generated ePub file.
|
34
|
+
Default is <current directory>/<Parsed_Title>.epub
|
35
|
+
-w, --write-profile NAME Save given options for later reuse as profile NAME.
|
36
|
+
-l, --load-profile NAME Load options from saved profile NAME.
|
37
|
+
-W, --write-default Save given options for later reuse as default profile.
|
38
|
+
-L, --list-profiles List saved profiles.
|
39
|
+
-C, --cleanup Clean up download cache.
|
40
|
+
-v, --verbose Turn on verbose output.
|
41
|
+
-q, --quiet Turn off any output except errors.
|
42
|
+
-V, --version Show version.
|
43
|
+
-h, --help Show this help message.
|
44
|
+
|
45
|
+
Parser options:
|
46
|
+
-x, --selector NAME:VALUE Set parser XPath or CSS selector NAME to VALUE.
|
47
|
+
Recognized selectors are: [title toc toc_item toc_section]
|
48
|
+
-m, --meta NAME:VALUE Set publication information metadata NAME to VALUE.
|
49
|
+
Valid metadata names are: [creator date description
|
50
|
+
language publisher relation rights subject title]
|
51
|
+
-F, --no-fixup Do not attempt to make document meet XHTML 1.0 Strict.
|
52
|
+
Default is to try and fix things that are broken.
|
53
|
+
-e, --encoding NAME Set source document encoding. Default is to autodetect.
|
54
|
+
|
55
|
+
Post-processing options:
|
56
|
+
-s, --stylesheet PATH Use custom stylesheet at PATH to add or override existing
|
57
|
+
CSS references in the source document.
|
58
|
+
-X, --remove SELECTOR Remove source element using XPath or CSS selector.
|
59
|
+
Use -X- to ignore stored profile.
|
60
|
+
-R, --rx /PATTERN/REPLACEMENT/ Edit source HTML using regular expressions.
|
61
|
+
Use -R- to ignore stored profile.
|
62
|
+
-B, --browse After processing, open resulting HTML in default browser.
|
63
|
+
|
64
|
+
== REQUIREMENTS:
|
65
|
+
|
66
|
+
wget or httrack
|
67
|
+
zip (Info-ZIP)
|
68
|
+
|
69
|
+
== INSTALL:
|
70
|
+
|
71
|
+
gem install repub
|
72
|
+
|
73
|
+
== LICENSE:
|
74
|
+
|
75
|
+
The MIT License
|
76
|
+
|
77
|
+
Copyright (c) 2009 Invisible Llama
|
78
|
+
|
79
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
80
|
+
of this software and associated documentation files (the "Software"), to deal
|
81
|
+
in the Software without restriction, including without limitation the rights
|
82
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
83
|
+
copies of the Software, and to permit persons to whom the Software is
|
84
|
+
furnished to do so, subject to the following conditions:
|
85
|
+
|
86
|
+
The above copyright notice and this permission notice shall be included in
|
87
|
+
all copies or substantial portions of the Software.
|
88
|
+
|
89
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
90
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
91
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
92
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
93
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
94
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
95
|
+
THE SOFTWARE.
|
data/Rakefile
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
begin
|
2
|
+
require 'bones'
|
3
|
+
Bones.setup
|
4
|
+
rescue LoadError
|
5
|
+
begin
|
6
|
+
load 'tasks/setup.rb'
|
7
|
+
rescue LoadError
|
8
|
+
raise RuntimeError, '### please install the "bones" gem ###'
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
ensure_in_path 'lib'
|
13
|
+
require 'repub'
|
14
|
+
|
15
|
+
task :default => 'test:run'
|
16
|
+
|
17
|
+
PROJ.name = 'repub'
|
18
|
+
PROJ.authors = 'Dmitri Goutnik'
|
19
|
+
PROJ.email = 'dg@invisiblellama.net'
|
20
|
+
PROJ.url = 'http://github.com/invisiblellama/repub/tree/master'
|
21
|
+
PROJ.version = Repub::VERSION
|
22
|
+
PROJ.rubyforge.name = 'repub'
|
23
|
+
PROJ.exclude = %w[tmp/ \.git/ \.DS_Store .*\.tmproj ^pkg/]
|
24
|
+
|
25
|
+
PROJ.spec.opts << '--color'
|
26
|
+
|
27
|
+
depend_on 'builder'
|
28
|
+
depend_on 'hpricot'
|
29
|
+
depend_on 'chardet'
|
30
|
+
depend_on 'launchy'
|
data/TODO.txt
ADDED
data/bin/repub
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require File.expand_path(
|
4
|
+
File.join(File.dirname(__FILE__), %w[.. lib repub]))
|
5
|
+
|
6
|
+
require 'repub/app'
|
7
|
+
|
8
|
+
# THE ADVENTURES OF SHERLOCK HOLMES
|
9
|
+
# repub -x 'title:body/h1' -x 'toc:body//table' -x 'toc_item://tr' -X 'body/pre,body//hr,body/h1,body/h2' http://www.gutenberg.org/dirs/etext99/advsh12h.htm
|
10
|
+
#
|
11
|
+
# ALICE'S ADVENTURES IN WONDERLAND
|
12
|
+
# repub -x 'title:body/h1' -x 'toc:body//table' -x 'toc_item://tr' -X 'body/pre,body//hr,body/h4' http://www.gutenberg.org/files/11/11-h/11-h.htm
|
13
|
+
#
|
14
|
+
# The Gelug-Kagyu Tradition of Mahamudra
|
15
|
+
# http://www.berzinarchives.com/web/x/prn/p.html_680632258.html
|
16
|
+
#
|
17
|
+
# Брюс Стерлинг. Схизматрица
|
18
|
+
# repub -x 'title://h2' -x 'toc:table' -x 'toc_item://a' -X 'div,table,//hr' http://lib.ru/STERLINGB/shizmatrica.txt_with-big-pictures.html
|
19
|
+
#
|
20
|
+
# Git User's Manual
|
21
|
+
# repub -x 'title://h1' -x 'toc://div.toc/dl' -x 'toc_item:/dt' http://www.kernel.org/pub/software/scm/git/docs/user-manual.html
|
22
|
+
|
23
|
+
|
24
|
+
Repub::App.instance.run ARGV
|
data/lib/repub.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
# Top-level namespace of the RePub library. Holds the version constant and
# helpers for locating and loading the library's own files.
module Repub

  # :stopdoc:
  VERSION = '0.2.1'
  LIBPATH = File.expand_path(File.dirname(__FILE__)) + File::SEPARATOR
  PATH = File.dirname(LIBPATH) + File::SEPARATOR
  # :startdoc:

  # Returns the version string for the library.
  #
  def self.version
    VERSION
  end

  # Returns the library path for the module. If any arguments are given,
  # they will be joined to the end of the library path using
  # <tt>File.join</tt>.
  #
  def self.libpath( *args )
    args.empty? ? LIBPATH : File.join(LIBPATH, args.flatten)
  end

  # Returns the path for the module. If any arguments are given,
  # they will be joined to the end of the path using
  # <tt>File.join</tt>.
  #
  def self.path( *args )
    args.empty? ? PATH : File.join(PATH, args.flatten)
  end

  # Utility method used to require all files ending in .rb that lie in the
  # directory below this file that has the same name as the filename passed
  # in. Optionally, a specific _directory_ name can be passed in such that
  # the _filename_ does not have to be equivalent to the directory.
  #
  def self.require_all_libs_relative_to( fname, dir = nil )
    dir ||= File.basename(fname, '.*')
    search_me = File.expand_path(
        File.join(File.dirname(fname), dir, '**', '*.rb'))

    # Sorted so the load order does not depend on how the filesystem
    # enumerates entries; nothing is printed while loading.
    Dir.glob(search_me).sort.each {|rb| require rb}
  end

end
|
45
|
+
|
46
|
+
$:.unshift Repub.libpath
|
data/lib/repub/app.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'singleton'
|
2
|
+
require 'rubygems'
|
3
|
+
require 'launchy'
|
4
|
+
require 'repub/app/utility'
|
5
|
+
require 'repub/app/options'
|
6
|
+
require 'repub/app/profile'
|
7
|
+
require 'repub/app/logger'
|
8
|
+
require 'repub/app/fetcher'
|
9
|
+
require 'repub/app/parser'
|
10
|
+
require 'repub/app/builder'
|
11
|
+
|
12
|
+
module Repub
  # Command-line application object. It is a Singleton; the executable
  # calls <tt>Repub::App.instance.run ARGV</tt>.
  class App
    include Singleton

    # Mix-in actual functionality
    include Options, Profile, Fetcher, Parser, Builder, Logger

    # Program name: the base name of the running script ($0).
    def self.name
      File.basename($0)
    end

    # Directory named ".repub" under the expanded home directory.
    def self.data_path
      File.join(File.expand_path('~'), '.repub')
    end

    # Parses +args+, then fetches, parses and builds the ePub, optionally
    # opening the processed document in a browser. A RuntimeError raised
    # along the way is logged as fatal instead of propagating.
    def run(args)
      begin
        parse_options(args)

        log.level = options[:verbosity]
        log.info "Making ePub from #{options[:url]}"
        result = build(parse(fetch))
        log.info "Saved #{result.output_path}"

        Launchy::Browser.run(result.asset_path) if options[:browser]
      rescue RuntimeError => error
        log.fatal "** ERROR: #{error}"
      end
    end

  end
end
|
@@ -0,0 +1,200 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
require 'tmpdir'
|
3
|
+
require 'repub/epub'
|
4
|
+
|
5
|
+
module Repub
|
6
|
+
class App
|
7
|
+
module Builder
|
8
|
+
|
9
|
+
class BuilderException < RuntimeError; end
|
10
|
+
|
11
|
+
# Mix-in entry point: creates a Builder configured with the current options
# and returns whatever its #build returns for +parser+.
def build(parser)
  builder = Builder.new(options)
  builder.build(parser)
end
|
14
|
+
|
15
|
+
class Builder
|
16
|
+
include Epub, Logger
|
17
|
+
|
18
|
+
attr_reader :output_path
|
19
|
+
attr_reader :asset_path
|
20
|
+
|
21
|
+
# Keeps a reference to the options hash that the build steps read
# (:metadata, :output_path, :rx, :css, :remove, :fixup, :browser).
def initialize(options)
  @options = options
end
|
24
|
+
|
25
|
+
# Builds the ePub for an already parsed document.
#
# Prepares content.opf and toc.ncx data from +parser+, works out the output
# path, then assembles everything inside a temporary directory. The temporary
# directory is removed afterwards unless the :browser option is set.
#
# Returns self, so the caller can read +output_path+ and +asset_path+.
def build(parser)
  @parser = parser

  # Initialize content.opf
  @content = Content.new(@parser.uid)
  # Default title is the parsed one
  @content.metadata.title = @parser.title
  # Override metadata values specified in options
  if @options[:metadata]
    @content.metadata.members.each do |m|
      m = m.to_sym
      next if m == :identifier    # do not allow to override uid
      if @options[:metadata][m]
        @content.metadata[m] = @options[:metadata][m]
        log.debug "-- Setting metadata #{m} to \"#{@content.metadata[m]}\""
      end
    end
  end

  # Initialize toc.ncx
  @toc = Toc.new(@parser.uid)
  # TOC title is the same as in content.opf
  @toc.title = @content.metadata.title

  # Setup output filename and path
  @output_path = File.expand_path(@options[:output_path].if_blank('.'))
  if File.directory?(@output_path)
    # The title becomes a file name here, so whitespace and path separators
    # are replaced; a "/" in the title must not point into another directory.
    file_name = @content.metadata.title.gsub(/[\s\/\\]/, '_')
    @output_path = File.join(@output_path, file_name)
  end
  # Avoid "book.epub.epub" when the given path already carries the extension
  @output_path += '.epub' unless File.extname(@output_path).downcase == '.epub'
  log.debug "-- Setting output path to #{@output_path}"

  # Build EPUB
  tmpdir = Dir.mktmpdir(App::name)
  begin
    FileUtils.chdir(tmpdir) do
      copy_and_process_assets
      write_meta_inf
      write_mime_type
      write_content
      write_toc
      write_epub
    end
  ensure
    # Keep tmp folder if we're going to open processed doc in browser
    FileUtils.remove_entry_secure(tmpdir) unless @options[:browser]
  end
  self
end
|
74
|
+
|
75
|
+
private
|
76
|
+
|
77
|
+
MetaInf = 'META-INF'
|
78
|
+
|
79
|
+
# Applies text-level fixes to the document file +asset+ and rewrites it in
# place.
#
# Every entry of @options[:rx] is an expression of the form
# "/PATTERN/REPLACEMENT/". Its first character is the delimiter; a
# backslash-escaped delimiter inside a part stands for the delimiter itself.
# A missing replacement means "replace with nothing". After the
# substitutions, an XHTML 1.0 Transitional doctype is prepended when the text
# contains no doctype declaration.
#
# Raises BuilderException when an expression has no pattern, has more than
# two parts, or its pattern is not a valid regular expression.
def postprocess_file(asset)
  source = IO.read(asset)
  # Do rx substitutions
  (@options[:rx] || []).each do |rx|
    # Work on a stripped copy so the stored option value is left untouched
    rx = rx.strip
    delimiter = rx[0, 1]
    # The delimiter is arbitrary user input and may be a regexp metacharacter
    delimiter_rx = Regexp.escape(delimiter)
    # Escaped delimiters are hidden behind a newline placeholder while the
    # expression is split, then restored as plain delimiters
    parts = rx.gsub(/\\#{delimiter_rx}/, "\n").
      split(/#{delimiter_rx}/).
      reject { |part| part.empty? }.
      map { |part| part.gsub(/\n/) { delimiter } }
    raise BuilderException, "Invalid regular expression" if parts.empty? || parts.size > 2
    pattern = parts[0]
    replacement = parts[1] || ''
    begin
      regexp = Regexp.new(pattern)
    rescue RegexpError => e
      raise BuilderException, "Invalid regular expression: #{e.message}"
    end
    log.info "Replacing pattern /#{pattern.gsub(delimiter) { "\\#{delimiter}" }}/ with \"#{replacement}\""
    source.gsub!(regexp, replacement)
  end
  # Add doctype if missing (doctype declarations are matched in any letter case)
  if source !~ /<!DOCTYPE/i
    log.debug "-- Adding missing doctype"
    source = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + source
  end
  # Overwrite asset with fixed version
  File.open(asset, 'w') do |f|
    f.write(source)
  end
end
|
105
|
+
|
106
|
+
# Applies document-level fixes to +asset+ through Hpricot and rewrites the
# file in place: optional XHTML strict fixup (:fixup option), pointing
# stylesheet links at the custom stylesheet (:css option), and removal of
# elements matching the selectors in the :remove option.
def postprocess_doc(asset)
  # Do Hpricot magic if fixup is ON.
  # The block form closes the read handle before the same file is reopened
  # for writing below, and File.open (unlike Kernel#open) never treats a name
  # starting with "|" as a command.
  doc = File.open(asset) do |input|
    Hpricot(input, :xhtml_strict => @options[:fixup])
  end
  # Substitute custom stylesheet
  if @options[:css] && !@options[:css].empty?
    doc.search('//link[@rel="stylesheet"]') do |link|
      link[:href] = File.basename(@options[:css])
      log.debug "-- Replacing CSS refs with #{link[:href]}"
    end
  end
  # Remove elements
  if @options[:remove] && !@options[:remove].empty?
    @options[:remove].each do |selector|
      log.info "Removing element(s) matching selector \"#{selector}\""
      doc.search(selector).remove
    end
  end
  # Overwrite asset with fixed version
  File.open(asset, 'w') do |f|
    f << doc.to_html
  end
end
|
128
|
+
|
129
|
+
# Copies documents, stylesheets and images from the download cache into the
# current directory and registers each one with the content manifest.
# Documents are post-processed after copying; @asset_path ends up holding
# the expanded path of the last document handled.
def copy_and_process_assets
  # Copy html
  @parser.cache.assets[:documents].each do |document|
    log.debug "-- Processing document #{document}"
    # Copy asset from cache
    FileUtils.cp(File.join(@parser.cache.path, document), '.')
    # Do post-processing
    postprocess_file(document)
    postprocess_doc(document)
    @content.add_document(document)
    @asset_path = File.expand_path(document)
  end

  # Copy css
  custom_css = @options[:css]
  if custom_css && !custom_css.empty?
    # Copy custom css
    log.debug "-- Using custom stylesheet #{custom_css}"
    FileUtils.cp(custom_css, '.')
    @content.add_stylesheet(File.basename(custom_css))
  else
    # No custom css, copy one from assets
    @parser.cache.assets[:stylesheets].each do |stylesheet|
      log.debug "-- Copying stylesheet #{stylesheet}"
      FileUtils.cp(File.join(@parser.cache.path, stylesheet), '.')
      @content.add_stylesheet(stylesheet)
    end
  end

  # Copy images
  @parser.cache.assets[:images].each do |picture|
    log.debug "-- Copying image #{picture}"
    FileUtils.cp(File.join(@parser.cache.path, picture), '.')
    @content.add_image(picture)
  end
end
|
162
|
+
|
163
|
+
# Creates the META-INF directory (if needed) and saves the ePub container
# description from inside it.
def write_meta_inf
  FileUtils.mkdir_p(MetaInf)
  FileUtils.chdir(MetaInf) { Epub::Container.new.save }
end
|
169
|
+
|
170
|
+
# Writes the "mimetype" file into the current directory. Its content is the
# ePub media type, with no trailing newline.
def write_mime_type
  File.open('mimetype', 'w') { |file| file << 'application/epub+zip' }
end
|
175
|
+
|
176
|
+
# Saves the content.opf data prepared in #build.
def write_content
  @content.save
end
|
179
|
+
|
180
|
+
# Fills the TOC navigation map from the parsed table of contents, then
# saves the TOC.
def write_toc
  nav_map = @toc.nav_map
  add_nav_points(nav_map, @parser.toc)
  @toc.save
end
|
184
|
+
|
185
|
+
# Adds one navigation point to +nav_collection+ for every entry of +toc+
# (using the entry's title and src), then recurses into the entry's subitems
# with the freshly created point as their parent.
def add_nav_points(nav_collection, toc)
  toc.each do |entry|
    point = nav_collection.add_nav_point(entry.title, entry.src)
    next unless entry.subitems
    add_nav_points(point, entry.subitems)
  end
end
|
191
|
+
|
192
|
+
# Packs the current directory into the archive at @output_path using the
# external "zip" program: first the mimetype file on its own, then all other
# entries of the directory with the same option strings as before.
#
# The commands are started from argument arrays, without a shell. The output
# path is derived from the downloaded page's title, so quotes, "$(...)" or
# backticks in it must never reach a shell command line.
#
# Raises BuilderException when the zip program cannot be started.
def write_epub
  run_zip = lambda do |args|
    begin
      # zip's standard output is captured (and returned) rather than printed
      IO.popen(['zip'] + args) { |io| io.read }
    rescue Errno::ENOENT
      raise BuilderException, 'zip command not found (Info-ZIP is required)'
    end
  end

  run_zip.call(['-X9', @output_path, 'mimetype'])
  # Dir.glob('*') stands in for the shell's "*" expansion: sorted, no dotfiles
  run_zip.call(['-Xr9D', @output_path] + Dir.glob('*').sort + ['-xi', 'mimetype'])
end
|
196
|
+
end
|
197
|
+
|
198
|
+
end
|
199
|
+
end
|
200
|
+
end
|