EPUBChop 0.0.1 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/EPUBChop.gemspec +1 -1
- data/README.md +4 -0
- data/bin/epubchop +3 -1
- data/lib/EPUBChop/chop.rb +142 -34
- data/lib/EPUBChop/version.rb +1 -1
- data/spec/epub/default.epub +0 -0
- data/spec/epubchop_spec.rb +1 -1
- metadata +7 -6
- /data/spec/{Verne_20000_West_pg11393.epub → epub/Verne_20000_West_pg11393.epub} +0 -0
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
MmM1ZTY5M2E0NjMwN2ViZDFkYzUzODAyZmZhN2VmOGVkZTkwNTNkYQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
N2IwZThjYjA2Yjg3YjRhZDBiOTliMDY5Y2FmMzJmMjc4YTI3NGI3MA==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
MTg1YWMwYmU5NzI1ZTEwZTQxNzJlM2M0YzU3MGY2ZGZjZTE4NDk1ZmJmMDkw
|
10
|
+
OTRkN2M0OTQ3ZDQxYThhMjRlN2ZhZjA5ZGExODIyNTg1NDczNTQ4MWM0MjU4
|
11
|
+
MzdmNDMyMWNiMDNhNzQ4ZmQ3NDY3ODgxNzQ2MjQ2ZDI5MmI1MGY=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
NTM0NzdkZDNmY2E3MzkwOGU0ODQxZjA5YmQ4ZWRjNDM1N2JlYzhkY2Q5YTcy
|
14
|
+
MDZiMjUyZDZmNzY4NDE1YzJhZTA1NzY0MjUzOWQ1ZDc3ZmQ3N2FkMzBjYjZm
|
15
|
+
NzQ3NGM2ZDUyODU0ZmEwNTA4OTQ4NGUwYTJlNDgzNmJlNDUzYjg=
|
data/EPUBChop.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.email = ["mehmet@celik.be"]
|
11
11
|
spec.description = %q{Create EPUB previews}
|
12
12
|
spec.summary = %q{Removes unwanted content from an EPUB}
|
13
|
-
spec.homepage = ""
|
13
|
+
spec.homepage = "https://github.com/mehmetc/EPUBChop"
|
14
14
|
spec.license = "MIT"
|
15
15
|
|
16
16
|
spec.files = `git ls-files`.split($/)
|
data/README.md
CHANGED
@@ -19,10 +19,14 @@ where [options] are:
|
|
19
19
|
```
|
20
20
|
|
21
21
|
### Example:
|
22
|
+
Create a new EPUB containing 10% of the content; all other pages will show the lines "Want to read more?" and "Buy the book!"
|
22
23
|
```ruby
|
23
24
|
epubchop --words 10 --base percentage --line1 "Want to read more?" --line2 "Buy the book!" my.epub
|
24
25
|
```
|
25
26
|
|
27
|
+
This gem depends on [epubinfo](http://github.com/chdorner/epubinfo). I made some additions to that gem, but they are still in a branch. Until they get accepted I'll be using the [epubinfo_with_toc](https://github.com/mehmetc/epubinfo/tree/table_of_contents)
|
28
|
+
gem.
|
29
|
+
|
26
30
|
## Contributing to EPUBChop
|
27
31
|
* Fork the project.
|
28
32
|
* Create a new branch to implement your bugfixes or features
|
data/bin/epubchop
CHANGED
@@ -18,6 +18,7 @@ BANNER
|
|
18
18
|
opt :base, "How to interprete the --words options... Possible value: percentage", :type => :string, :default => 'percentage'
|
19
19
|
opt :line1, "Text that is shown on line 1 of the chopped pages", :type => :string, :default => 'Continue reading?'
|
20
20
|
opt :line2, "Text that is shown on line 2 of the chopped pages", :type => :string, :default => 'Go to your local library or buy the book.'
|
21
|
+
opt :chop_by, "Follow the SPINE or the NCX of the ePub", :type => :string, :default => :spine
|
21
22
|
end
|
22
23
|
|
23
24
|
Trollop::die "need an EPUB file name" if ARGV.empty?
|
@@ -30,11 +31,12 @@ begin
|
|
30
31
|
text = []
|
31
32
|
text << options[:line1] if options.has_key?(:line1)
|
32
33
|
text << options[:line2] if options.has_key?(:line2)
|
34
|
+
chop_by << options[:chop_by]
|
33
35
|
|
34
36
|
puts 'loading EPUB'
|
35
37
|
b=EPUBChop.get(filename)
|
36
38
|
puts 'chopping EPUB'
|
37
|
-
c=b.chop({:base => base.to_s, :words => words, :text => text})
|
39
|
+
c=b.chop({:base => base.to_s, :words => words, :text => text, :chop_by => chop_by})
|
38
40
|
puts 'rebuilding EPUB'
|
39
41
|
FileUtils.move(c, "chopped_#{File.basename(filename)}")
|
40
42
|
|
data/lib/EPUBChop/chop.rb
CHANGED
@@ -10,7 +10,6 @@ module EPUBChop
|
|
10
10
|
def initialize(input, options ={})
|
11
11
|
set_defaults(options)
|
12
12
|
|
13
|
-
|
14
13
|
raise 'Please supply an input file name' if input.nil?
|
15
14
|
|
16
15
|
#count the number of words in a file
|
@@ -31,14 +30,40 @@ module EPUBChop
|
|
31
30
|
set_defaults(options)
|
32
31
|
|
33
32
|
original_zip_file = @book.table_of_contents.parser.zip_file
|
33
|
+
extract_dir = extract_epub_to_tmp_dir(original_zip_file)
|
34
|
+
|
35
|
+
chop_files_in_tmp_dir(extract_dir)
|
36
|
+
remove_unused_media_from_tmp_dir(extract_dir)
|
37
|
+
|
38
|
+
|
39
|
+
return rebuild_epub_from_tmp_dir(extract_dir)
|
40
|
+
rescue Zip::ZipError => e
|
41
|
+
raise RuntimeError, "Error processing EPUB. #{e.message}"
|
42
|
+
rescue Exception => e
|
43
|
+
puts "Chopping went wrong. #{e.message}"
|
44
|
+
puts e.backtrace
|
45
|
+
|
46
|
+
return nil
|
47
|
+
ensure
|
48
|
+
FileUtils.remove_entry_secure(extract_dir)
|
49
|
+
end
|
50
|
+
|
51
|
+
private
|
52
|
+
|
53
|
+
def extract_epub_to_tmp_dir(original_zip_file)
|
34
54
|
#unzip in temp dir
|
35
55
|
extract_dir = Dir.mktmpdir('epub_extract')
|
36
56
|
original_zip_file.entries.each do |e|
|
37
57
|
file_dir = File.split(e.name)[0]
|
38
|
-
|
58
|
+
FileUtils.mkdir_p(File.join(extract_dir, file_dir)) unless Dir.exists?(File.join(extract_dir, file_dir)) || file_dir.eql?('.')
|
39
59
|
original_zip_file.extract(e, File.join(extract_dir, e.name))
|
40
60
|
end
|
41
61
|
|
62
|
+
extract_dir
|
63
|
+
end
|
64
|
+
|
65
|
+
|
66
|
+
def chop_files_in_tmp_dir(extract_dir)
|
42
67
|
#fix spine files
|
43
68
|
filename_list = @resource_word_count.keys
|
44
69
|
filename_list.each do |filename|
|
@@ -50,27 +75,39 @@ module EPUBChop
|
|
50
75
|
FileUtils.rm("#{extract_dir}/#{filename}", :force => true)
|
51
76
|
FileUtils.touch "#{extract_dir}/#{filename}"
|
52
77
|
File.open("#{extract_dir}/#{filename}", 'w') do |f|
|
53
|
-
f.puts
|
78
|
+
f.puts empty_file_with_cover(filename)
|
54
79
|
end
|
55
80
|
|
56
81
|
else
|
82
|
+
#noinspection RubyResolve
|
57
83
|
resource = Nokogiri::XML(@book.table_of_contents.resources[filename]) do |config|
|
58
84
|
config.noblanks.nonet
|
59
85
|
end
|
60
86
|
resource.css('script').remove
|
61
87
|
resource.css('style').remove
|
62
88
|
resource_text = resource.at_css('body').text.split[0..processed_file_size]
|
63
|
-
resource_text_length = resource_text.length
|
89
|
+
#resource_text_length = resource_text.length
|
64
90
|
|
65
91
|
# get a string that can be found
|
66
92
|
data = nil
|
67
|
-
window_begin = 5
|
93
|
+
window_begin = default_window_begin = 5
|
68
94
|
window_end = 0
|
69
95
|
while data.nil?
|
70
|
-
look_for = resource_text[(processed_file_size - window_begin)..(processed_file_size - window_end)]
|
71
|
-
|
72
|
-
|
73
|
-
|
96
|
+
look_for = resource_text[(processed_file_size - window_begin)..(processed_file_size - window_end)]
|
97
|
+
|
98
|
+
if look_for.nil?
|
99
|
+
window_begin = default_window_begin += 5
|
100
|
+
window_end = 0
|
101
|
+
else
|
102
|
+
data = resource.at_css("*:contains('#{look_for.join(' ')}')")
|
103
|
+
window_begin -= 1
|
104
|
+
window_end += 1
|
105
|
+
|
106
|
+
if window_begin == window_end
|
107
|
+
window_begin = default_window_begin += 5
|
108
|
+
window_end = 0
|
109
|
+
end
|
110
|
+
end
|
74
111
|
end
|
75
112
|
|
76
113
|
#limit on found string
|
@@ -92,8 +129,9 @@ module EPUBChop
|
|
92
129
|
end
|
93
130
|
end
|
94
131
|
end
|
95
|
-
|
132
|
+
end
|
96
133
|
|
134
|
+
def rebuild_epub_from_tmp_dir(extract_dir)
|
97
135
|
#zip new ebook
|
98
136
|
new_ebook_name = Tempfile.new(['epub', '.epub'], Dir.tmpdir)
|
99
137
|
new_ebook_name_path = new_ebook_name.path
|
@@ -101,24 +139,56 @@ module EPUBChop
|
|
101
139
|
|
102
140
|
zipfile = Zip::File.open(new_ebook_name_path, Zip::File::CREATE)
|
103
141
|
|
104
|
-
Dir[File.join(extract_dir, '**', '**')]
|
142
|
+
epub_files = Dir[File.join(extract_dir, '**', '**')]
|
143
|
+
|
144
|
+
#mimetype should be the first entry and should be stored uncompressed. Otherwise FIDO will not recognize this as an EPUB
|
145
|
+
mimetype = epub_files.delete("#{extract_dir}/mimetype")
|
146
|
+
mimetype_entry = Zip::Entry.new(zipfile, mimetype.sub("#{extract_dir}/", ''), '', '', 0,0, Zip::Entry::STORED)
|
147
|
+
zipfile.add(mimetype_entry, mimetype) unless mimetype.nil?
|
148
|
+
|
149
|
+
#all the other files
|
150
|
+
epub_files.each do |file|
|
105
151
|
zipfile.add(file.sub("#{extract_dir}/", ''), file)
|
106
152
|
end
|
107
153
|
zipfile.close
|
108
154
|
|
109
|
-
|
110
|
-
|
111
|
-
raise RuntimeError, ''
|
112
|
-
rescue Exception => e
|
113
|
-
puts "Chopping went wrong. #{e.message}"
|
114
|
-
puts e.backtrace
|
155
|
+
new_ebook_name_path
|
156
|
+
end
|
115
157
|
|
116
|
-
|
117
|
-
|
118
|
-
|
158
|
+
#noinspection RubyInstanceMethodNamingConvention
|
159
|
+
def remove_unused_media_from_tmp_dir(extract_dir)
|
160
|
+
#TODO: remove other media
|
161
|
+
#TODO: rebuild toc.ncx and content.opf
|
162
|
+
remove_unused_images_from_tmp_dir(extract_dir)
|
163
|
+
end
|
164
|
+
|
165
|
+
#noinspection RubyInstanceMethodNamingConvention
|
166
|
+
def remove_unused_images_from_tmp_dir(extract_dir)
|
167
|
+
puts 'removing unused media'
|
168
|
+
not_to_be_deleted_images = []
|
169
|
+
all_images = @book.table_of_contents.resources.images.map {|i| i[:uri]}
|
170
|
+
@book.table_of_contents.resources.html.each do |resource|
|
171
|
+
file = Nokogiri::HTML(File.read("#{extract_dir}/#{resource[:uri]}"))
|
172
|
+
|
173
|
+
all_images.each do |image|
|
174
|
+
i = image.split('/').last
|
175
|
+
data = file.at_css("img[src$='#{i}']")
|
176
|
+
|
177
|
+
if data
|
178
|
+
not_to_be_deleted_images << image
|
179
|
+
end
|
180
|
+
end
|
181
|
+
end
|
182
|
+
|
183
|
+
to_be_deleted_images = (all_images - not_to_be_deleted_images)
|
184
|
+
to_be_deleted_images.each do |image|
|
185
|
+
puts "\t\tremoving #{image}"
|
186
|
+
File.delete("#{extract_dir}/#{image}") if File.exists?("#{extract_dir}/#{image}")
|
187
|
+
end
|
188
|
+
|
189
|
+
to_be_deleted_images
|
119
190
|
end
|
120
191
|
|
121
|
-
private
|
122
192
|
|
123
193
|
def set_defaults(options)
|
124
194
|
@words = options[:words] || 10
|
@@ -130,34 +200,69 @@ module EPUBChop
|
|
130
200
|
@text1 = options[:text] || 'Continue reading? Go to your local library or buy the book.'
|
131
201
|
@text2 = ''
|
132
202
|
end
|
203
|
+
|
204
|
+
@chop_by = options[:chop_by] || :spine
|
133
205
|
end
|
134
206
|
|
135
|
-
def
|
207
|
+
def empty_file_with_cover(filename)
|
208
|
+
number_of_subdirectories = filename.split('/').size - 1
|
209
|
+
|
210
|
+
cover_path = ''
|
211
|
+
number_of_subdirectories.times{ cover_path += '../'}
|
212
|
+
|
213
|
+
cover_path += @book.cover && @book.cover.exists? ? @book.cover.exists?.to_s : ''
|
214
|
+
|
136
215
|
data = <<DATA
|
137
216
|
<?xml version="1.0" encoding="utf-8" standalone="no"?>
|
138
|
-
<!DOCTYPE html>
|
217
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
139
218
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
140
|
-
<head>
|
141
|
-
<title>Read more</title>
|
142
|
-
</head>
|
143
|
-
|
144
|
-
<
|
145
|
-
<div style=
|
146
|
-
<div
|
147
|
-
<
|
148
|
-
</
|
149
|
-
</
|
219
|
+
<head>
|
220
|
+
<title>Read more</title>
|
221
|
+
</head>
|
222
|
+
|
223
|
+
<body>
|
224
|
+
<div style="margin-top:100px;width:500px;margin-left:auto;margin-right:auto;">
|
225
|
+
<div style='text-align:center;'>
|
226
|
+
<h2>#{CGI.escape_html(@text1 ? @text1 : '')}</h2>
|
227
|
+
<span>#{CGI.escape_html(@text2 ? @text2 : '')}</span>
|
228
|
+
</div>
|
229
|
+
|
230
|
+
<div style="margin-top:20px;">
|
231
|
+
<div style="float:left;margin-right:30px;max-height: 190px; min-height: 120px; width: 125px;">
|
232
|
+
<img src="#{cover_path}" alt="" style="width:100%" />
|
233
|
+
</div>
|
234
|
+
|
235
|
+
<div style='padding-top:10px;'>
|
236
|
+
<h3>#{CGI.escape_html(@book.titles.first ? @book.titles.first : '' )}</h3>
|
237
|
+
</div>
|
238
|
+
|
239
|
+
<div>
|
240
|
+
<h4>#{CGI.escape_html(@book.creators.first ? @book.creators.first.name : '')}</h4>
|
241
|
+
</div>
|
242
|
+
|
243
|
+
</div>
|
244
|
+
|
245
|
+
<br />
|
246
|
+
|
247
|
+
<div style="clear:both;text-align:center;font-size:0.5em;"> #{CGI.escape_html(@book.rights ? @book.rights : '')} </div>
|
248
|
+
</div>
|
150
249
|
</body>
|
151
250
|
</html>
|
251
|
+
|
152
252
|
DATA
|
253
|
+
|
254
|
+
data
|
153
255
|
end
|
154
256
|
|
155
257
|
def count_words(input)
|
156
258
|
@book = EPUBInfo.get(input)
|
157
259
|
resource_word_count = {}
|
158
260
|
if @book
|
159
|
-
@book.table_of_contents.resources.
|
261
|
+
chop_by = @chop_by.eql?(:ncx) ? @book.table_of_contents.resources.ncx : @book.table_of_contents.resources.spine
|
262
|
+
|
263
|
+
chop_by.each do |resource|
|
160
264
|
raw = Nokogiri::HTML(@book.table_of_contents.resources[resource[:uri]]) do |config|
|
265
|
+
#noinspection RubyResolve
|
161
266
|
config.noblanks.nonet
|
162
267
|
end
|
163
268
|
raw.css('script').remove
|
@@ -187,11 +292,14 @@ DATA
|
|
187
292
|
resource_allowed_word_count = @resource_word_count.select do |r|
|
188
293
|
(word_counter += @resource_word_count[r]) < allowed_words
|
189
294
|
end
|
295
|
+
|
190
296
|
word_counter = resource_allowed_word_count.values.inject(0) { |sum, i| sum + i }
|
191
297
|
|
192
298
|
how_many_words_left = allowed_words - word_counter
|
193
299
|
if how_many_words_left > 0
|
194
300
|
resource_to_split_name = @resource_word_count.keys[resource_allowed_word_count.length]
|
301
|
+
|
302
|
+
#noinspection RubyLocalVariableNamingConvention
|
195
303
|
word_count_of_resource_to_split = @resource_word_count[resource_to_split_name]
|
196
304
|
if how_many_words_left < word_count_of_resource_to_split
|
197
305
|
resource_allowed_word_count.store(resource_to_split_name, how_many_words_left)
|
data/lib/EPUBChop/version.rb
CHANGED
Binary file
|
data/spec/epubchop_spec.rb
CHANGED
@@ -4,7 +4,7 @@ require 'spec_helper'
|
|
4
4
|
describe 'EPUBChop' do
|
5
5
|
before(:all) do
|
6
6
|
#chop EPUB at 10% of total words
|
7
|
-
@chop = EPUBChop.get('./spec/Verne_20000_West_pg11393.epub', {:base => :percentage, :words => 10})
|
7
|
+
@chop = EPUBChop.get('./spec/epub/Verne_20000_West_pg11393.epub', {:base => :percentage, :words => 10})
|
8
8
|
end
|
9
9
|
|
10
10
|
it 'load an epub' do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: EPUBChop
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mehmet Celik
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2014-01-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -114,10 +114,11 @@ files:
|
|
114
114
|
- lib/EPUBChop/chop.rb
|
115
115
|
- lib/EPUBChop/version.rb
|
116
116
|
- lib/trollop.rb
|
117
|
-
- spec/Verne_20000_West_pg11393.epub
|
117
|
+
- spec/epub/Verne_20000_West_pg11393.epub
|
118
|
+
- spec/epub/default.epub
|
118
119
|
- spec/epubchop_spec.rb
|
119
120
|
- spec/spec_helper.rb
|
120
|
-
homepage:
|
121
|
+
homepage: https://github.com/mehmetc/EPUBChop
|
121
122
|
licenses:
|
122
123
|
- MIT
|
123
124
|
metadata: {}
|
@@ -142,7 +143,7 @@ signing_key:
|
|
142
143
|
specification_version: 4
|
143
144
|
summary: Removes unwanted content from an EPUB
|
144
145
|
test_files:
|
145
|
-
- spec/Verne_20000_West_pg11393.epub
|
146
|
+
- spec/epub/Verne_20000_West_pg11393.epub
|
147
|
+
- spec/epub/default.epub
|
146
148
|
- spec/epubchop_spec.rb
|
147
149
|
- spec/spec_helper.rb
|
148
|
-
has_rdoc:
|
File without changes
|