EPUBChop 0.0.1 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/EPUBChop.gemspec +1 -1
- data/README.md +4 -0
- data/bin/epubchop +3 -1
- data/lib/EPUBChop/chop.rb +142 -34
- data/lib/EPUBChop/version.rb +1 -1
- data/spec/epub/default.epub +0 -0
- data/spec/epubchop_spec.rb +1 -1
- metadata +7 -6
- /data/spec/{Verne_20000_West_pg11393.epub → epub/Verne_20000_West_pg11393.epub} +0 -0
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
MmM1ZTY5M2E0NjMwN2ViZDFkYzUzODAyZmZhN2VmOGVkZTkwNTNkYQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
N2IwZThjYjA2Yjg3YjRhZDBiOTliMDY5Y2FmMzJmMjc4YTI3NGI3MA==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
MTg1YWMwYmU5NzI1ZTEwZTQxNzJlM2M0YzU3MGY2ZGZjZTE4NDk1ZmJmMDkw
|
10
|
+
OTRkN2M0OTQ3ZDQxYThhMjRlN2ZhZjA5ZGExODIyNTg1NDczNTQ4MWM0MjU4
|
11
|
+
MzdmNDMyMWNiMDNhNzQ4ZmQ3NDY3ODgxNzQ2MjQ2ZDI5MmI1MGY=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
NTM0NzdkZDNmY2E3MzkwOGU0ODQxZjA5YmQ4ZWRjNDM1N2JlYzhkY2Q5YTcy
|
14
|
+
MDZiMjUyZDZmNzY4NDE1YzJhZTA1NzY0MjUzOWQ1ZDc3ZmQ3N2FkMzBjYjZm
|
15
|
+
NzQ3NGM2ZDUyODU0ZmEwNTA4OTQ4NGUwYTJlNDgzNmJlNDUzYjg=
|
data/EPUBChop.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.email = ["mehmet@celik.be"]
|
11
11
|
spec.description = %q{Create EPUB previews}
|
12
12
|
spec.summary = %q{Removes unwanted content from an EPUB}
|
13
|
-
spec.homepage = ""
|
13
|
+
spec.homepage = "https://github.com/mehmetc/EPUBChop"
|
14
14
|
spec.license = "MIT"
|
15
15
|
|
16
16
|
spec.files = `git ls-files`.split($/)
|
data/README.md
CHANGED
@@ -19,10 +19,14 @@ where [options] are:
|
|
19
19
|
```
|
20
20
|
|
21
21
|
### Example:
|
22
|
+
Create a new EPUB with 10% of the content; all other pages should contain the lines "Want to read more? Buy the book!"
|
22
23
|
```ruby
|
23
24
|
epubchop --words 10 --base percentage -line1 "Want to read more?" -line2 "Buy the book!" my.epub
|
24
25
|
```
|
25
26
|
|
27
|
+
This gem depends on [] I made some additions to the gem but they are still in a branch. Until they get accepted I'll be using the []
|
28
|
+
gem.
|
29
|
+
|
26
30
|
## Contributing to EPUBChop
|
27
31
|
* Fork the project.
|
28
32
|
* Create a new branch to implement your bugfixes or features
|
data/bin/epubchop
CHANGED
@@ -18,6 +18,7 @@ BANNER
|
|
18
18
|
opt :base, "How to interprete the --words options... Possible value: percentage", :type => :string, :default => 'percentage'
|
19
19
|
opt :line1, "Text that is shown on line 1 of the chopped pages", :type => :string, :default => 'Continue reading?'
|
20
20
|
opt :line2, "Text that is shown on line 2 of the chopped pages", :type => :string, :default => 'Go to your local library or buy the book.'
|
21
|
+
opt :chop_by, "Follow the SPINE or the NCX of the ePub", :type => :string, :default => :spine
|
21
22
|
end
|
22
23
|
|
23
24
|
Trollop::die "need an EPUB file name" if ARGV.empty?
|
@@ -30,11 +31,12 @@ begin
|
|
30
31
|
text = []
|
31
32
|
text << options[:line1] if options.has_key?(:line1)
|
32
33
|
text << options[:line2] if options.has_key?(:line2)
|
34
|
+
chop_by << options[:chop_by]
|
33
35
|
|
34
36
|
puts 'loading EPUB'
|
35
37
|
b=EPUBChop.get(filename)
|
36
38
|
puts 'chopping EPUB'
|
37
|
-
c=b.chop({:base => base.to_s, :words => words, :text => text})
|
39
|
+
c=b.chop({:base => base.to_s, :words => words, :text => text, :chop_by => chop_by})
|
38
40
|
puts 'rebuilding EPUB'
|
39
41
|
FileUtils.move(c, "chopped_#{File.basename(filename)}")
|
40
42
|
|
data/lib/EPUBChop/chop.rb
CHANGED
@@ -10,7 +10,6 @@ module EPUBChop
|
|
10
10
|
def initialize(input, options ={})
|
11
11
|
set_defaults(options)
|
12
12
|
|
13
|
-
|
14
13
|
raise 'Please supply an input file name' if input.nil?
|
15
14
|
|
16
15
|
#count the number of words in a file
|
@@ -31,14 +30,40 @@ module EPUBChop
|
|
31
30
|
set_defaults(options)
|
32
31
|
|
33
32
|
original_zip_file = @book.table_of_contents.parser.zip_file
|
33
|
+
extract_dir = extract_epub_to_tmp_dir(original_zip_file)
|
34
|
+
|
35
|
+
chop_files_in_tmp_dir(extract_dir)
|
36
|
+
remove_unused_media_from_tmp_dir(extract_dir)
|
37
|
+
|
38
|
+
|
39
|
+
return rebuild_epub_from_tmp_dir(extract_dir)
|
40
|
+
rescue Zip::ZipError => e
|
41
|
+
raise RuntimeError, "Error processing EPUB. #{e.message}"
|
42
|
+
rescue Exception => e
|
43
|
+
puts "Chopping went wrong. #{e.message}"
|
44
|
+
puts e.backtrace
|
45
|
+
|
46
|
+
return nil
|
47
|
+
ensure
|
48
|
+
FileUtils.remove_entry_secure(extract_dir)
|
49
|
+
end
|
50
|
+
|
51
|
+
private
|
52
|
+
|
53
|
+
def extract_epub_to_tmp_dir(original_zip_file)
|
34
54
|
#unzip in temp dir
|
35
55
|
extract_dir = Dir.mktmpdir('epub_extract')
|
36
56
|
original_zip_file.entries.each do |e|
|
37
57
|
file_dir = File.split(e.name)[0]
|
38
|
-
|
58
|
+
FileUtils.mkdir_p(File.join(extract_dir, file_dir)) unless Dir.exists?(File.join(extract_dir, file_dir)) || file_dir.eql?('.')
|
39
59
|
original_zip_file.extract(e, File.join(extract_dir, e.name))
|
40
60
|
end
|
41
61
|
|
62
|
+
extract_dir
|
63
|
+
end
|
64
|
+
|
65
|
+
|
66
|
+
def chop_files_in_tmp_dir(extract_dir)
|
42
67
|
#fix spine files
|
43
68
|
filename_list = @resource_word_count.keys
|
44
69
|
filename_list.each do |filename|
|
@@ -50,27 +75,39 @@ module EPUBChop
|
|
50
75
|
FileUtils.rm("#{extract_dir}/#{filename}", :force => true)
|
51
76
|
FileUtils.touch "#{extract_dir}/#{filename}"
|
52
77
|
File.open("#{extract_dir}/#{filename}", 'w') do |f|
|
53
|
-
f.puts
|
78
|
+
f.puts empty_file_with_cover(filename)
|
54
79
|
end
|
55
80
|
|
56
81
|
else
|
82
|
+
#noinspection RubyResolve
|
57
83
|
resource = Nokogiri::XML(@book.table_of_contents.resources[filename]) do |config|
|
58
84
|
config.noblanks.nonet
|
59
85
|
end
|
60
86
|
resource.css('script').remove
|
61
87
|
resource.css('style').remove
|
62
88
|
resource_text = resource.at_css('body').text.split[0..processed_file_size]
|
63
|
-
resource_text_length = resource_text.length
|
89
|
+
#resource_text_length = resource_text.length
|
64
90
|
|
65
91
|
# get a string that can be found
|
66
92
|
data = nil
|
67
|
-
window_begin = 5
|
93
|
+
window_begin = default_window_begin = 5
|
68
94
|
window_end = 0
|
69
95
|
while data.nil?
|
70
|
-
look_for = resource_text[(processed_file_size - window_begin)..(processed_file_size - window_end)]
|
71
|
-
|
72
|
-
|
73
|
-
|
96
|
+
look_for = resource_text[(processed_file_size - window_begin)..(processed_file_size - window_end)]
|
97
|
+
|
98
|
+
if look_for.nil?
|
99
|
+
window_begin = default_window_begin += 5
|
100
|
+
window_end = 0
|
101
|
+
else
|
102
|
+
data = resource.at_css("*:contains('#{look_for.join(' ')}')")
|
103
|
+
window_begin -= 1
|
104
|
+
window_end += 1
|
105
|
+
|
106
|
+
if window_begin == window_end
|
107
|
+
window_begin = default_window_begin += 5
|
108
|
+
window_end = 0
|
109
|
+
end
|
110
|
+
end
|
74
111
|
end
|
75
112
|
|
76
113
|
#limit on found string
|
@@ -92,8 +129,9 @@ module EPUBChop
|
|
92
129
|
end
|
93
130
|
end
|
94
131
|
end
|
95
|
-
|
132
|
+
end
|
96
133
|
|
134
|
+
def rebuild_epub_from_tmp_dir(extract_dir)
|
97
135
|
#zip new ebook
|
98
136
|
new_ebook_name = Tempfile.new(['epub', '.epub'], Dir.tmpdir)
|
99
137
|
new_ebook_name_path = new_ebook_name.path
|
@@ -101,24 +139,56 @@ module EPUBChop
|
|
101
139
|
|
102
140
|
zipfile = Zip::File.open(new_ebook_name_path, Zip::File::CREATE)
|
103
141
|
|
104
|
-
Dir[File.join(extract_dir, '**', '**')]
|
142
|
+
epub_files = Dir[File.join(extract_dir, '**', '**')]
|
143
|
+
|
144
|
+
#mimetype should be the first entry and should not be compressed. Otherwise FIDO will not know that this is an EPUB
|
145
|
+
mimetype = epub_files.delete("#{extract_dir}/mimetype")
|
146
|
+
mimetype_entry = Zip::Entry.new(zipfile, mimetype.sub("#{extract_dir}/", ''), '', '', 0,0, Zip::Entry::STORED)
|
147
|
+
zipfile.add(mimetype_entry, mimetype) unless mimetype.nil?
|
148
|
+
|
149
|
+
#all the other files
|
150
|
+
epub_files.each do |file|
|
105
151
|
zipfile.add(file.sub("#{extract_dir}/", ''), file)
|
106
152
|
end
|
107
153
|
zipfile.close
|
108
154
|
|
109
|
-
|
110
|
-
|
111
|
-
raise RuntimeError, ''
|
112
|
-
rescue Exception => e
|
113
|
-
puts "Chopping went wrong. #{e.message}"
|
114
|
-
puts e.backtrace
|
155
|
+
new_ebook_name_path
|
156
|
+
end
|
115
157
|
|
116
|
-
|
117
|
-
|
118
|
-
|
158
|
+
#noinspection RubyInstanceMethodNamingConvention
|
159
|
+
def remove_unused_media_from_tmp_dir(extract_dir)
|
160
|
+
#TODO: remove other media
|
161
|
+
#TODO: rebuild toc.ncx and content.opf
|
162
|
+
remove_unused_images_from_tmp_dir(extract_dir)
|
163
|
+
end
|
164
|
+
|
165
|
+
#noinspection RubyInstanceMethodNamingConvention
|
166
|
+
def remove_unused_images_from_tmp_dir(extract_dir)
|
167
|
+
puts 'removing unused media'
|
168
|
+
not_to_be_deleted_images = []
|
169
|
+
all_images = @book.table_of_contents.resources.images.map {|i| i[:uri]}
|
170
|
+
@book.table_of_contents.resources.html.each do |resource|
|
171
|
+
file = Nokogiri::HTML(File.read("#{extract_dir}/#{resource[:uri]}"))
|
172
|
+
|
173
|
+
all_images.each do |image|
|
174
|
+
i = image.split('/').last
|
175
|
+
data = file.at_css("img[src$='#{i}']")
|
176
|
+
|
177
|
+
if data
|
178
|
+
not_to_be_deleted_images << image
|
179
|
+
end
|
180
|
+
end
|
181
|
+
end
|
182
|
+
|
183
|
+
to_be_deleted_images = (all_images - not_to_be_deleted_images)
|
184
|
+
to_be_deleted_images.each do |image|
|
185
|
+
puts "\t\tremoving #{image}"
|
186
|
+
File.delete("#{extract_dir}/#{image}") if File.exists?("#{extract_dir}/#{image}")
|
187
|
+
end
|
188
|
+
|
189
|
+
to_be_deleted_images
|
119
190
|
end
|
120
191
|
|
121
|
-
private
|
122
192
|
|
123
193
|
def set_defaults(options)
|
124
194
|
@words = options[:words] || 10
|
@@ -130,34 +200,69 @@ module EPUBChop
|
|
130
200
|
@text1 = options[:text] || 'Continue reading? Go to your local library or buy the book.'
|
131
201
|
@text2 = ''
|
132
202
|
end
|
203
|
+
|
204
|
+
@chop_by = options[:chop_by] || :spine
|
133
205
|
end
|
134
206
|
|
135
|
-
def
|
207
|
+
def empty_file_with_cover(filename)
|
208
|
+
number_of_subdirectories = filename.split('/').size - 1
|
209
|
+
|
210
|
+
cover_path = ''
|
211
|
+
number_of_subdirectories.times{ cover_path += '../'}
|
212
|
+
|
213
|
+
cover_path += @book.cover && @book.cover.exists? ? @book.cover.exists?.to_s : ''
|
214
|
+
|
136
215
|
data = <<DATA
|
137
216
|
<?xml version="1.0" encoding="utf-8" standalone="no"?>
|
138
|
-
<!DOCTYPE html>
|
217
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
139
218
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
140
|
-
<head>
|
141
|
-
<title>Read more</title>
|
142
|
-
</head>
|
143
|
-
|
144
|
-
<
|
145
|
-
<div style=
|
146
|
-
<div
|
147
|
-
<
|
148
|
-
</
|
149
|
-
</
|
219
|
+
<head>
|
220
|
+
<title>Read more</title>
|
221
|
+
</head>
|
222
|
+
|
223
|
+
<body>
|
224
|
+
<div style="margin-top:100px;width:500px;margin-left:auto;margin-right:auto;">
|
225
|
+
<div style='text-align:center;'>
|
226
|
+
<h2>#{CGI.escape_html(@text1 ? @text1 : '')}</h2>
|
227
|
+
<span>#{CGI.escape_html(@text2 ? @text2 : '')}</span>
|
228
|
+
</div>
|
229
|
+
|
230
|
+
<div style="margin-top:20px;">
|
231
|
+
<div style="float:left;margin-right:30px;max-height: 190px; min-height: 120px; width: 125px;">
|
232
|
+
<img src="#{cover_path}" alt="" style="width:100%" />
|
233
|
+
</div>
|
234
|
+
|
235
|
+
<div style='padding-top:10px;'>
|
236
|
+
<h3>#{CGI.escape_html(@book.titles.first ? @book.titles.first : '' )}</h3>
|
237
|
+
</div>
|
238
|
+
|
239
|
+
<div>
|
240
|
+
<h4>#{CGI.escape_html(@book.creators.first ? @book.creators.first.name : '')}</h4>
|
241
|
+
</div>
|
242
|
+
|
243
|
+
</div>
|
244
|
+
|
245
|
+
<br />
|
246
|
+
|
247
|
+
<div style="clear:both;text-align:center;font-size:0.5em;"> #{CGI.escape_html(@book.rights ? @book.rights : '')} </div>
|
248
|
+
</div>
|
150
249
|
</body>
|
151
250
|
</html>
|
251
|
+
|
152
252
|
DATA
|
253
|
+
|
254
|
+
data
|
153
255
|
end
|
154
256
|
|
155
257
|
def count_words(input)
|
156
258
|
@book = EPUBInfo.get(input)
|
157
259
|
resource_word_count = {}
|
158
260
|
if @book
|
159
|
-
@book.table_of_contents.resources.
|
261
|
+
chop_by = @chop_by.eql?(:ncx) ? @book.table_of_contents.resources.ncx : @book.table_of_contents.resources.spine
|
262
|
+
|
263
|
+
chop_by.each do |resource|
|
160
264
|
raw = Nokogiri::HTML(@book.table_of_contents.resources[resource[:uri]]) do |config|
|
265
|
+
#noinspection RubyResolve
|
161
266
|
config.noblanks.nonet
|
162
267
|
end
|
163
268
|
raw.css('script').remove
|
@@ -187,11 +292,14 @@ DATA
|
|
187
292
|
resource_allowed_word_count = @resource_word_count.select do |r|
|
188
293
|
(word_counter += @resource_word_count[r]) < allowed_words
|
189
294
|
end
|
295
|
+
|
190
296
|
word_counter = resource_allowed_word_count.values.inject(0) { |sum, i| sum + i }
|
191
297
|
|
192
298
|
how_many_words_left = allowed_words - word_counter
|
193
299
|
if how_many_words_left > 0
|
194
300
|
resource_to_split_name = @resource_word_count.keys[resource_allowed_word_count.length]
|
301
|
+
|
302
|
+
#noinspection RubyLocalVariableNamingConvention
|
195
303
|
word_count_of_resource_to_split = @resource_word_count[resource_to_split_name]
|
196
304
|
if how_many_words_left < word_count_of_resource_to_split
|
197
305
|
resource_allowed_word_count.store(resource_to_split_name, how_many_words_left)
|
data/lib/EPUBChop/version.rb
CHANGED
Binary file
|
data/spec/epubchop_spec.rb
CHANGED
@@ -4,7 +4,7 @@ require 'spec_helper'
|
|
4
4
|
describe 'EPUBChop' do
|
5
5
|
before(:all) do
|
6
6
|
#chop EPUB at 10% of total words
|
7
|
-
@chop = EPUBChop.get('./spec/Verne_20000_West_pg11393.epub', {:base => :percentage, :words => 10})
|
7
|
+
@chop = EPUBChop.get('./spec/epub/Verne_20000_West_pg11393.epub', {:base => :percentage, :words => 10})
|
8
8
|
end
|
9
9
|
|
10
10
|
it 'load an epub' do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: EPUBChop
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mehmet Celik
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2014-01-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -114,10 +114,11 @@ files:
|
|
114
114
|
- lib/EPUBChop/chop.rb
|
115
115
|
- lib/EPUBChop/version.rb
|
116
116
|
- lib/trollop.rb
|
117
|
-
- spec/Verne_20000_West_pg11393.epub
|
117
|
+
- spec/epub/Verne_20000_West_pg11393.epub
|
118
|
+
- spec/epub/default.epub
|
118
119
|
- spec/epubchop_spec.rb
|
119
120
|
- spec/spec_helper.rb
|
120
|
-
homepage:
|
121
|
+
homepage: https://github.com/mehmetc/EPUBChop
|
121
122
|
licenses:
|
122
123
|
- MIT
|
123
124
|
metadata: {}
|
@@ -142,7 +143,7 @@ signing_key:
|
|
142
143
|
specification_version: 4
|
143
144
|
summary: Removes unwanted content from an EPUB
|
144
145
|
test_files:
|
145
|
-
- spec/Verne_20000_West_pg11393.epub
|
146
|
+
- spec/epub/Verne_20000_West_pg11393.epub
|
147
|
+
- spec/epub/default.epub
|
146
148
|
- spec/epubchop_spec.rb
|
147
149
|
- spec/spec_helper.rb
|
148
|
-
has_rdoc:
|
File without changes
|