EPUBChop 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.gitignore +21 -0
- data/.travis.yml +5 -0
- data/EPUBChop.gemspec +27 -0
- data/Gemfile +4 -0
- data/LICENSE +20 -0
- data/README.md +34 -0
- data/Rakefile +7 -0
- data/bin/epubchop +44 -0
- data/lib/EPUBChop/chop.rb +209 -0
- data/lib/EPUBChop/version.rb +3 -0
- data/lib/EPUBChop.rb +9 -0
- data/lib/trollop.rb +783 -0
- data/spec/Verne_20000_West_pg11393.epub +0 -0
- data/spec/epubchop_spec.rb +33 -0
- data/spec/spec_helper.rb +13 -0
- metadata +148 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
NTk5MTRjMjQ1ZDk2YTEzM2E3MWNkN2ViODEzYzdlYWQ4ODE0YWE4NA==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
YzYyMTA1YmYzOWY1N2UwNWRlM2ZiOTIwMmU0NTRhNGQ2YjE3YTc3Nw==
|
7
|
+
SHA512:
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
MjI4ZDgzNWY0NzI3NmQ4YzAzNzNlNTZkMGMzZDA4MjIwMmVhOTQ4Y2EwODI5
|
10
|
+
ZWZjNDNkNDRhOWI4N2YxYWMxZjc5MDU2MjFkMTIzYWQ2MTk5YmI2YTczZjEx
|
11
|
+
ZWFjMWY4YTgwN2FkYjJiOGNlYTJhNTk5ZGY3N2VlZDE5MGU0NWM=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
YWYyZDNiMDFkOGExMWY5MGFiM2Y4MDdkYWQ3MWNlMDkxMTQ2MDkzNzIxOWE1
|
14
|
+
Mzg3ZDBhNGVmMGJiZmQ0ODgzMmQzYmFkZWU2ZmNhYmZkMjUxMzc0NDQzNzE1
|
15
|
+
YWNhOGZhMTVmMzhiODFhMTY1ODAzNGE2MDI5YmE5MzI4MGI3ZmI=
|
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/EPUBChop.gemspec
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for EPUBChop: puts the gem's own lib directory on the load
# path so the version constant can be read, then declares the gem's metadata,
# packaged files and dependencies.
lib_dir = File.expand_path('lib', File.dirname(__FILE__))
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'EPUBChop/version'

Gem::Specification.new do |gem|
  # Identity
  gem.name        = 'EPUBChop'
  gem.version     = EPUBChop::VERSION
  gem.authors     = ['Mehmet Celik']
  gem.email       = ['mehmet@celik.be']
  gem.description = 'Create EPUB previews'
  gem.summary     = 'Removes unwanted content from an EPUB'
  gem.homepage    = ''
  gem.license     = 'MIT'

  # Everything tracked by git is packaged; executables and tests are picked
  # out of that list by directory prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']

  # Needed only while developing the gem
  gem.add_development_dependency 'bundler', '~> 1.3'
  gem.add_development_dependency 'rake'
  gem.add_development_dependency 'rspec'

  # Needed at run time
  gem.add_runtime_dependency 'epubinfo_with_toc'
  gem.add_runtime_dependency 'rubyzip', '~> 1.0'
  gem.add_runtime_dependency 'nokogiri'
end
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2013 Mehmet Celik
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
6
|
+
this software and associated documentation files (the "Software"), to deal in
|
7
|
+
the Software without restriction, including without limitation the rights to
|
8
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
9
|
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
10
|
+
subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
17
|
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
18
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
19
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
20
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
EPUBChop [![Continuous Integration](https://travis-ci.org/mehmetc/EPUBChop.png?branch=master)](http://travis-ci.org/mehmetc/EPUBChop)
|
2
|
+
========
|
3
|
+
|
4
|
+
Creates EPUB previews
|
5
|
+
|
6
|
+
```
|
7
|
+
$ ./bin/epubchop --help
|
8
|
+
EPUBChop will create a preview version of an EPUB file.
|
9
|
+
|
10
|
+
Usage:
|
11
|
+
epubchop [options] <filename>
|
12
|
+
|
13
|
+
where [options] are:
|
14
|
+
--words, -w <i>: the amount of words to put in the preview (default: 10)
|
15
|
+
--base, -b <s>: How to interpret the --words option. Possible value: percentage (default: percentage)
|
16
|
+
--line1, -l <s>: Text that is shown on line 1 of the chopped pages (default: Continue reading?)
|
17
|
+
--line2, -i <s>: Text that is shown on line 2 of the chopped pages (default: Go to your local library or buy the book.)
|
18
|
+
--help, -h: Show this message
|
19
|
+
```
|
20
|
+
|
21
|
+
### Example:
|
22
|
+
```ruby
|
23
|
+
epubchop --words 10 --base percentage --line1 "Want to read more?" --line2 "Buy the book!" my.epub
|
24
|
+
```
|
25
|
+
|
26
|
+
## Contributing to EPUBChop
|
27
|
+
* Fork the project.
|
28
|
+
* Create a new branch to implement your bugfixes or features
|
29
|
+
* Commit and push until you are happy with your contribution.
|
30
|
+
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
31
|
+
|
32
|
+
## Copyright
|
33
|
+
|
34
|
+
Copyright (c) 2013 LIBIS/KULeuven, Mehmet Celik. See LICENSE for further details.
|
data/Rakefile
ADDED
data/bin/epubchop
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command line front end for EPUBChop.
#
# Loads the EPUB named by the first non-option argument, chops it down to a
# preview and writes the result as "chopped_<original file name>" in the
# current working directory. Exits with status 1 when anything goes wrong.

# Resolve lib relative to this script so it also works outside the project root.
$LOAD_PATH.unshift(File.expand_path('../../lib', __FILE__))
require 'fileutils'
require 'EPUBChop'
require 'trollop'

options = Trollop::options do
  # `version` is a Trollop DSL call; writing `version = ...` would only create
  # a local variable and the version text would never be registered.
  version "EPUBChop #{EPUBChop::VERSION} (c) 2013 LIBIS/KULeuven, Mehmet Celik"
  banner <<-BANNER
EPUBChop will create a preview version of an EPUB file.

Usage:
epubchop [options] <filename>

where [options] are:
BANNER

  opt :words, "the amount of words to put in the preview", :type => :int, :default => 10
  opt :base, "How to interpret the --words option... Possible value: percentage", :type => :string, :default => 'percentage'
  opt :line1, "Text that is shown on line 1 of the chopped pages", :type => :string, :default => 'Continue reading?'
  opt :line2, "Text that is shown on line 2 of the chopped pages", :type => :string, :default => 'Go to your local library or buy the book.'
end

Trollop::die "need an EPUB file name" if ARGV.empty?

begin
  filename = File.expand_path(ARGV[0])

  # Both lines are passed positionally: index 0 is line 1, index 1 is line 2.
  text = []
  text << options[:line1] if options.has_key?(:line1)
  text << options[:line2] if options.has_key?(:line2)

  puts 'loading EPUB'
  book = EPUBChop.get(filename)

  puts 'chopping EPUB'
  chopped_path = book.chop({:base => options[:base].to_s, :words => options[:words], :text => text})
  # chop reports its own failures and then returns nil; stop here instead of
  # handing nil to FileUtils.move.
  raise 'the EPUB could not be chopped' if chopped_path.nil?

  puts 'rebuilding EPUB'
  FileUtils.move(chopped_path, "chopped_#{File.basename(filename)}")
rescue StandardError => e
  # StandardError only: Ctrl-C and `exit` must not be reported as chop errors.
  puts "An error occurred\n#{e.message}"
  exit 1
end
|
@@ -0,0 +1,209 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'epubinfo'
|
3
|
+
require 'tempfile'
|
4
|
+
require 'zip'
|
5
|
+
|
6
|
+
module EPUBChop
  # Builds a shortened "preview" copy of an EPUB.
  #
  # The words of every spine resource are counted when the object is created.
  # #chop then keeps resources, in spine order, until the word budget is used
  # up: the resource in which the budget runs out is truncated after the
  # paragraph containing the cut-off word, and every later resource that has
  # text is replaced by a small placeholder page.
  class Chop
    # How far (in words) the search phrase reaches back from the cut-off word.
    CUT_PHRASE_SPAN = 5

    attr_reader :book, :words, :base, :resource_word_count, :text1, :text2

    # input   - path of the EPUB file (handed to EPUBInfo.get).
    # options - Hash, see #set_defaults for the recognised keys.
    #
    # Raises RuntimeError when input is nil.
    def initialize(input, options = {})
      raise 'Please supply an input file name' if input.nil?

      set_defaults(options)

      #count the number of words in a file
      @resource_word_count = count_words(input)
    end

    # Total number of words over all counted resources.
    def total_words
      @resource_word_count.values.inject(0) { |sum, count| sum + count }
    end

    # Hash of resource name => number of words that resource may keep.
    # Computed on first use and cached until the options change.
    def resource_allowed_word_count
      @resource_allowed_word_count ||= files_allowed(allowed_words(@words, @base))
    end

    # Writes the chopped EPUB to a new temporary file.
    #
    # options - same keys as for #initialize; keys that are left out keep the
    #           value they already had.
    #
    # Returns the path of the new EPUB, or nil when chopping failed (the
    # error and its backtrace are printed). Raises RuntimeError when the zip
    # library reports an error.
    def chop(options = {})
      set_defaults(options)

      original_zip_file = @book.table_of_contents.parser.zip_file

      #unzip in temp dir
      extract_dir = Dir.mktmpdir('epub_extract')
      extract_archive(original_zip_file, extract_dir)

      #fix spine files
      @resource_word_count.each do |filename, original_file_size|
        processed_file_size = resource_allowed_word_count[filename]
        next if original_file_size == processed_file_size

        target = File.join(extract_dir, filename)
        if processed_file_size == 0
          write_placeholder(target)
        else
          write_truncated(filename, target, processed_file_size)
        end
      end
      #TODO:remove unwanted media

      #zip new ebook
      build_archive(extract_dir)
    rescue Zip::ZipError => e
      # Same exception class as before, but keep the cause in the message.
      raise RuntimeError, "Could not process the EPUB archive: #{e.message}"
    rescue StandardError => e
      puts "Chopping went wrong. #{e.message}"
      puts e.backtrace

      nil
    ensure
      # extract_dir is still nil when the failure happened before it was made.
      FileUtils.remove_entry_secure(extract_dir) if extract_dir && File.exist?(extract_dir)
    end

    private

    # Stores the options on the instance.
    #
    # options - Hash with optional keys:
    #           :words - size of the preview (default 10)
    #           :base  - how :words is interpreted (default :percentage)
    #           :text  - String, or Array of two Strings, shown on the
    #                    placeholder page
    #
    # A key that is absent keeps its current value, so calling #chop without
    # options does not throw away what was given to the constructor.
    def set_defaults(options)
      @words = options[:words] || @words || 10
      @base = options[:base] || @base || :percentage

      text = options[:text]
      if text.is_a?(Array)
        @text1 = text[0] || 'Continue reading?'
        @text2 = text[1] || 'Go to your local library or buy the book.'
      elsif text || @text1.nil?
        @text1 = text || 'Continue reading? Go to your local library or buy the book.'
        @text2 = ''
      end

      # The cached budgets depend on @words and @base.
      @allowed_words = nil
      @resource_allowed_word_count = nil
    end

    # XHTML source of the placeholder page.
    # NOTE(review): @text1 and @text2 are inserted verbatim, so they have to be
    # XML-safe (an unescaped '&' or '<' makes the page malformed).
    def empty_file
      <<DATA
<?xml version="1.0" encoding="utf-8" standalone="no"?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Read more</title>
</head>
<body>
<center>
<div style='width:100%;border:1px solid black;margin-top:20px;padding:5px'>
<div><h2>#{@text1}</h2></div>
<div><h2>#{@text2}</h2></div>
</div>
</center>
</body>
</html>
DATA
    end

    # Loads the book into @book and counts the words of every spine resource.
    #
    # Returns a Hash of resource uri => word count, in spine order (empty when
    # EPUBInfo.get returns nothing).
    def count_words(input)
      @book = EPUBInfo.get(input)
      resource_word_count = {}
      return resource_word_count unless @book

      @book.table_of_contents.resources.spine.each do |resource|
        raw = Nokogiri::HTML(@book.table_of_contents.resources[resource[:uri]]) do |config|
          config.noblanks.nonet
        end
        resource_word_count.store(resource[:uri], visible_words(raw).size)
      end

      resource_word_count
    end

    # Word budget of the preview.
    #
    # words - the requested amount.
    # base  - 'percentage' makes words a percentage of #total_words; any other
    #         value makes it an absolute number of words.
    def allowed_words(words, base)
      @allowed_words =
        case base.to_s
        when 'percentage'
          # Multiply before dividing: 100 * (29 / 100.0) truncates to 28.
          (total_words * words / 100.0).to_i
        else
          words
        end
    end

    # Spreads the word budget over the resources in order.
    #
    # Returns a Hash of resource name => words it may keep: the full count
    # while the budget lasts, the remainder for the resource in which it runs
    # out, and 0 for everything after that.
    def files_allowed(allowed_words)
      remaining = [allowed_words, 0].max

      @resource_word_count.each_with_object({}) do |(name, count), granted|
        share = [count, remaining].min
        granted[name] = share
        remaining -= share
      end
    end

    # Strips script and style elements from the document (they stay removed)
    # and returns the words of its body, or [] when there is no body.
    def visible_words(document)
      document.css('script').remove
      document.css('style').remove

      body = document.at_css('body')
      body ? body.text.split : []
    end

    # Unpacks every entry of zip_file below extract_dir.
    def extract_archive(zip_file, extract_dir)
      root = File.expand_path(extract_dir)

      zip_file.entries.each do |entry|
        # Entry names come from the EPUB; refuse names that climb out of the
        # work directory (e.g. "../../x").
        destination = File.expand_path(entry.name, root)
        unless destination.start_with?(root + File::SEPARATOR)
          raise "Refusing to extract '#{entry.name}' outside of the work directory"
        end

        # mkdir_p also covers nested directories without an entry of their own.
        FileUtils.mkdir_p(File.dirname(destination))
        zip_file.extract(entry, File.join(extract_dir, entry.name))
      end
    end

    # Replaces the file at target with the placeholder page.
    def write_placeholder(target)
      File.open(target, 'w') do |f|
        f.puts empty_file
      end
    end

    # Rewrites the resource so that nothing follows the paragraph containing
    # the cut-off word. When no paragraph can serve as cut point, the
    # placeholder page is written instead.
    def write_truncated(filename, target, allowed)
      resource = Nokogiri::XML(@book.table_of_contents.resources[filename]) do |config|
        config.noblanks.nonet
      end

      cut = find_cut_paragraph(resource, visible_words(resource), allowed)
      return write_placeholder(target) if cut.nil?

      #limit on found string
      following = cut.next_element
      while following
        upcoming = following.next_element
        following.remove
        following = upcoming
      end

      #persist page
      File.open(target, 'w') do |f|
        f.puts resource.to_xml(:save_with => Nokogiri::XML::Node::SaveOptions::NO_DECLARATION)
      end
    end

    # Finds the paragraph after which the resource is cut.
    #
    # A phrase of up to CUT_PHRASE_SPAN + 1 words ending at the cut-off word is
    # looked up in the paragraphs; as long as nothing matches, the phrase moves
    # back one word at a time. The text is compared in Ruby instead of being
    # interpolated into a CSS selector, so quotes in the book cannot break the
    # search.
    #
    # Returns the matching paragraph, the first paragraph when no phrase
    # matched, or nil when the resource has no paragraphs.
    def find_cut_paragraph(resource, words, allowed)
      paragraphs = resource.css('p')
      window_end = [allowed, words.length - 1].min

      while window_end >= 0
        window_start = [window_end - CUT_PHRASE_SPAN, 0].max
        look_for = words[window_start..window_end].join(' ')

        match = paragraphs.find { |paragraph| paragraph.text.include?(look_for) }
        return match if match

        window_end -= 1
      end

      paragraphs.first
    end

    # Zips the content of extract_dir into a new temporary .epub file and
    # returns its path.
    def build_archive(extract_dir)
      # Reserve a unique name, then release the file so that the archive is
      # created from scratch at that path.
      reserved = Tempfile.new(['epub', '.epub'], Dir.tmpdir)
      archive_path = reserved.path.dup
      reserved.close!

      # The EPUB container format expects "mimetype" to be the first entry.
      # NOTE(review): it should also be stored uncompressed, which is not
      # enforced here; entry order relies on the zip library - confirm.
      mimetype_path = File.join(extract_dir, 'mimetype')
      first, rest = Dir[File.join(extract_dir, '**', '**')].partition { |file| file == mimetype_path }

      # Block form closes the archive even when adding an entry fails.
      Zip::File.open(archive_path, Zip::File::CREATE) do |zipfile|
        (first + rest).each do |file|
          zipfile.add(file.sub("#{extract_dir}/", ''), file)
        end
      end

      archive_path
    end
  end
end
|