EPUBChop 0.0.1 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- NTk5MTRjMjQ1ZDk2YTEzM2E3MWNkN2ViODEzYzdlYWQ4ODE0YWE4NA==
4
+ MmM1ZTY5M2E0NjMwN2ViZDFkYzUzODAyZmZhN2VmOGVkZTkwNTNkYQ==
5
5
  data.tar.gz: !binary |-
6
- YzYyMTA1YmYzOWY1N2UwNWRlM2ZiOTIwMmU0NTRhNGQ2YjE3YTc3Nw==
6
+ N2IwZThjYjA2Yjg3YjRhZDBiOTliMDY5Y2FmMzJmMjc4YTI3NGI3MA==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- MjI4ZDgzNWY0NzI3NmQ4YzAzNzNlNTZkMGMzZDA4MjIwMmVhOTQ4Y2EwODI5
10
- ZWZjNDNkNDRhOWI4N2YxYWMxZjc5MDU2MjFkMTIzYWQ2MTk5YmI2YTczZjEx
11
- ZWFjMWY4YTgwN2FkYjJiOGNlYTJhNTk5ZGY3N2VlZDE5MGU0NWM=
9
+ MTg1YWMwYmU5NzI1ZTEwZTQxNzJlM2M0YzU3MGY2ZGZjZTE4NDk1ZmJmMDkw
10
+ OTRkN2M0OTQ3ZDQxYThhMjRlN2ZhZjA5ZGExODIyNTg1NDczNTQ4MWM0MjU4
11
+ MzdmNDMyMWNiMDNhNzQ4ZmQ3NDY3ODgxNzQ2MjQ2ZDI5MmI1MGY=
12
12
  data.tar.gz: !binary |-
13
- YWYyZDNiMDFkOGExMWY5MGFiM2Y4MDdkYWQ3MWNlMDkxMTQ2MDkzNzIxOWE1
14
- Mzg3ZDBhNGVmMGJiZmQ0ODgzMmQzYmFkZWU2ZmNhYmZkMjUxMzc0NDQzNzE1
15
- YWNhOGZhMTVmMzhiODFhMTY1ODAzNGE2MDI5YmE5MzI4MGI3ZmI=
13
+ NTM0NzdkZDNmY2E3MzkwOGU0ODQxZjA5YmQ4ZWRjNDM1N2JlYzhkY2Q5YTcy
14
+ MDZiMjUyZDZmNzY4NDE1YzJhZTA1NzY0MjUzOWQ1ZDc3ZmQ3N2FkMzBjYjZm
15
+ NzQ3NGM2ZDUyODU0ZmEwNTA4OTQ4NGUwYTJlNDgzNmJlNDUzYjg=
data/EPUBChop.gemspec CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
10
10
  spec.email = ["mehmet@celik.be"]
11
11
  spec.description = %q{Create EPUB previews}
12
12
  spec.summary = %q{Removes unwanted content from an EPUB}
13
- spec.homepage = ""
13
+ spec.homepage = "https://github.com/mehmetc/EPUBChop"
14
14
  spec.license = "MIT"
15
15
 
16
16
  spec.files = `git ls-files`.split($/)
data/README.md CHANGED
@@ -19,10 +19,14 @@ where [options] are:
19
19
  ```
20
20
 
21
21
  ### Example:
22
+ Create a new EPUB with 10% of the content; all other pages should contain the lines "Want to read more? Buy the book!"
22
23
  ```ruby
23
24
  epubchop --words 10 --base percentage -line1 "Want to read more?" -line2 "Buy the book!" my.epub
24
25
  ```
25
26
 
27
+ This gem depends on [epubinfo](http://github.com/chdorner/epubinfo). I made some additions to the gem, but they are still in a branch. Until they get accepted, I'll be using the [epubinfo_with_toc](https://github.com/mehmetc/epubinfo/tree/table_of_contents)
28
+ gem.
29
+
26
30
  ## Contributing to EPUBChop
27
31
  * Fork the project.
28
32
  * Create a new branch to implement your bugfixes or features
data/bin/epubchop CHANGED
@@ -18,6 +18,7 @@ BANNER
18
18
  opt :base, "How to interprete the --words options... Possible value: percentage", :type => :string, :default => 'percentage'
19
19
  opt :line1, "Text that is shown on line 1 of the chopped pages", :type => :string, :default => 'Continue reading?'
20
20
  opt :line2, "Text that is shown on line 2 of the chopped pages", :type => :string, :default => 'Go to your local library or buy the book.'
21
+ opt :chop_by, "Follow the SPINE or the NCX of the ePub", :type => :string, :default => :spine
21
22
  end
22
23
 
23
24
  Trollop::die "need an EPUB file name" if ARGV.empty?
@@ -30,11 +31,12 @@ begin
30
31
  text = []
31
32
  text << options[:line1] if options.has_key?(:line1)
32
33
  text << options[:line2] if options.has_key?(:line2)
34
+ chop_by << options[:chop_by]
33
35
 
34
36
  puts 'loading EPUB'
35
37
  b=EPUBChop.get(filename)
36
38
  puts 'chopping EPUB'
37
- c=b.chop({:base => base.to_s, :words => words, :text => text})
39
+ c=b.chop({:base => base.to_s, :words => words, :text => text, :chop_by => chop_by})
38
40
  puts 'rebuilding EPUB'
39
41
  FileUtils.move(c, "chopped_#{File.basename(filename)}")
40
42
 
data/lib/EPUBChop/chop.rb CHANGED
@@ -10,7 +10,6 @@ module EPUBChop
10
10
  def initialize(input, options ={})
11
11
  set_defaults(options)
12
12
 
13
-
14
13
  raise 'Please supply an input file name' if input.nil?
15
14
 
16
15
  #count the number of words in a file
@@ -31,14 +30,40 @@ module EPUBChop
31
30
  set_defaults(options)
32
31
 
33
32
  original_zip_file = @book.table_of_contents.parser.zip_file
33
+ extract_dir = extract_epub_to_tmp_dir(original_zip_file)
34
+
35
+ chop_files_in_tmp_dir(extract_dir)
36
+ remove_unused_media_from_tmp_dir(extract_dir)
37
+
38
+
39
+ return rebuild_epub_from_tmp_dir(extract_dir)
40
+ rescue Zip::ZipError => e
41
+ raise RuntimeError, "Error processing EPUB. #{e.message}"
42
+ rescue Exception => e
43
+ puts "Chopping went wrong. #{e.message}"
44
+ puts e.backtrace
45
+
46
+ return nil
47
+ ensure
48
+ FileUtils.remove_entry_secure(extract_dir)
49
+ end
50
+
51
+ private
52
+
53
+ def extract_epub_to_tmp_dir(original_zip_file)
34
54
  #unzip in temp dir
35
55
  extract_dir = Dir.mktmpdir('epub_extract')
36
56
  original_zip_file.entries.each do |e|
37
57
  file_dir = File.split(e.name)[0]
38
- Dir.mkdir(File.join(extract_dir, file_dir)) unless Dir.exists?(File.join(extract_dir, file_dir)) || file_dir.eql?(".")
58
+ FileUtils.mkdir_p(File.join(extract_dir, file_dir)) unless Dir.exists?(File.join(extract_dir, file_dir)) || file_dir.eql?('.')
39
59
  original_zip_file.extract(e, File.join(extract_dir, e.name))
40
60
  end
41
61
 
62
+ extract_dir
63
+ end
64
+
65
+
66
+ def chop_files_in_tmp_dir(extract_dir)
42
67
  #fix spine files
43
68
  filename_list = @resource_word_count.keys
44
69
  filename_list.each do |filename|
@@ -50,27 +75,39 @@ module EPUBChop
50
75
  FileUtils.rm("#{extract_dir}/#{filename}", :force => true)
51
76
  FileUtils.touch "#{extract_dir}/#{filename}"
52
77
  File.open("#{extract_dir}/#{filename}", 'w') do |f|
53
- f.puts empty_file
78
+ f.puts empty_file_with_cover(filename)
54
79
  end
55
80
 
56
81
  else
82
+ #noinspection RubyResolve
57
83
  resource = Nokogiri::XML(@book.table_of_contents.resources[filename]) do |config|
58
84
  config.noblanks.nonet
59
85
  end
60
86
  resource.css('script').remove
61
87
  resource.css('style').remove
62
88
  resource_text = resource.at_css('body').text.split[0..processed_file_size]
63
- resource_text_length = resource_text.length
89
+ #resource_text_length = resource_text.length
64
90
 
65
91
  # get a string that can be found
66
92
  data = nil
67
- window_begin = 5
93
+ window_begin = default_window_begin = 5
68
94
  window_end = 0
69
95
  while data.nil?
70
- look_for = resource_text[(processed_file_size - window_begin)..(processed_file_size - window_end)].join(' ')
71
- data = resource.at_css("p:contains('#{look_for}')")
72
- window_begin += 1
73
- window_end += 1
96
+ look_for = resource_text[(processed_file_size - window_begin)..(processed_file_size - window_end)]
97
+
98
+ if look_for.nil?
99
+ window_begin = default_window_begin += 5
100
+ window_end = 0
101
+ else
102
+ data = resource.at_css("*:contains('#{look_for.join(' ')}')")
103
+ window_begin -= 1
104
+ window_end += 1
105
+
106
+ if window_begin == window_end
107
+ window_begin = default_window_begin += 5
108
+ window_end = 0
109
+ end
110
+ end
74
111
  end
75
112
 
76
113
  #limit on found string
@@ -92,8 +129,9 @@ module EPUBChop
92
129
  end
93
130
  end
94
131
  end
95
- #TODO:remove unwanted media
132
+ end
96
133
 
134
+ def rebuild_epub_from_tmp_dir(extract_dir)
97
135
  #zip new ebook
98
136
  new_ebook_name = Tempfile.new(['epub', '.epub'], Dir.tmpdir)
99
137
  new_ebook_name_path = new_ebook_name.path
@@ -101,24 +139,56 @@ module EPUBChop
101
139
 
102
140
  zipfile = Zip::File.open(new_ebook_name_path, Zip::File::CREATE)
103
141
 
104
- Dir[File.join(extract_dir, '**', '**')].each do |file|
142
+ epub_files = Dir[File.join(extract_dir, '**', '**')]
143
+
144
+ #mimetype should be the first entry and should not be compressed. Otherwise FIDO will not know that this is an EPUB
145
+ mimetype = epub_files.delete("#{extract_dir}/mimetype")
146
+ mimetype_entry = Zip::Entry.new(zipfile, mimetype.sub("#{extract_dir}/", ''), '', '', 0,0, Zip::Entry::STORED)
147
+ zipfile.add(mimetype_entry, mimetype) unless mimetype.nil?
148
+
149
+ #all the other files
150
+ epub_files.each do |file|
105
151
  zipfile.add(file.sub("#{extract_dir}/", ''), file)
106
152
  end
107
153
  zipfile.close
108
154
 
109
- return new_ebook_name_path
110
- rescue Zip::ZipError => e
111
- raise RuntimeError, ''
112
- rescue Exception => e
113
- puts "Chopping went wrong. #{e.message}"
114
- puts e.backtrace
155
+ new_ebook_name_path
156
+ end
115
157
 
116
- return nil
117
- ensure
118
- FileUtils.remove_entry_secure(extract_dir)
158
+ #noinspection RubyInstanceMethodNamingConvention
159
+ def remove_unused_media_from_tmp_dir(extract_dir)
160
+ #TODO: remove other media
161
+ #TODO: rebuild toc.ncx and content.opf
162
+ remove_unused_images_from_tmp_dir(extract_dir)
163
+ end
164
+
165
+ #noinspection RubyInstanceMethodNamingConvention
166
+ def remove_unused_images_from_tmp_dir(extract_dir)
167
+ puts 'removing unused media'
168
+ not_to_be_deleted_images = []
169
+ all_images = @book.table_of_contents.resources.images.map {|i| i[:uri]}
170
+ @book.table_of_contents.resources.html.each do |resource|
171
+ file = Nokogiri::HTML(File.read("#{extract_dir}/#{resource[:uri]}"))
172
+
173
+ all_images.each do |image|
174
+ i = image.split('/').last
175
+ data = file.at_css("img[src$='#{i}']")
176
+
177
+ if data
178
+ not_to_be_deleted_images << image
179
+ end
180
+ end
181
+ end
182
+
183
+ to_be_deleted_images = (all_images - not_to_be_deleted_images)
184
+ to_be_deleted_images.each do |image|
185
+ puts "\t\tremoving #{image}"
186
+ File.delete("#{extract_dir}/#{image}") if File.exists?("#{extract_dir}/#{image}")
187
+ end
188
+
189
+ to_be_deleted_images
119
190
  end
120
191
 
121
- private
122
192
 
123
193
  def set_defaults(options)
124
194
  @words = options[:words] || 10
@@ -130,34 +200,69 @@ module EPUBChop
130
200
  @text1 = options[:text] || 'Continue reading? Go to your local library or buy the book.'
131
201
  @text2 = ''
132
202
  end
203
+
204
+ @chop_by = options[:chop_by] || :spine
133
205
  end
134
206
 
135
- def empty_file
207
+ def empty_file_with_cover(filename)
208
+ number_of_subdirectories = filename.split('/').size - 1
209
+
210
+ cover_path = ''
211
+ number_of_subdirectories.times{ cover_path += '../'}
212
+
213
+ cover_path += @book.cover && @book.cover.exists? ? @book.cover.exists?.to_s : ''
214
+
136
215
  data = <<DATA
137
216
  <?xml version="1.0" encoding="utf-8" standalone="no"?>
138
- <!DOCTYPE html>
217
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
139
218
  <html xmlns="http://www.w3.org/1999/xhtml">
140
- <head>
141
- <title>Read more</title>
142
- </head>
143
- <body>
144
- <center>
145
- <div style='width:100%;border:1px solid black;margin-top:20px;padding:5px'>
146
- <div><h2>#{@text1}</h2></div>
147
- <div><h2>#{@text2}</h2></div>
148
- </div>
149
- </center>
219
+ <head>
220
+ <title>Read more</title>
221
+ </head>
222
+
223
+ <body>
224
+ <div style="margin-top:100px;width:500px;margin-left:auto;margin-right:auto;">
225
+ <div style='text-align:center;'>
226
+ <h2>#{CGI.escape_html(@text1 ? @text1 : '')}</h2>
227
+ <span>#{CGI.escape_html(@text2 ? @text2 : '')}</span>
228
+ </div>
229
+
230
+ <div style="margin-top:20px;">
231
+ <div style="float:left;margin-right:30px;max-height: 190px; min-height: 120px; width: 125px;">
232
+ <img src="#{cover_path}" alt="" style="width:100%" />
233
+ </div>
234
+
235
+ <div style='padding-top:10px;'>
236
+ <h3>#{CGI.escape_html(@book.titles.first ? @book.titles.first : '' )}</h3>
237
+ </div>
238
+
239
+ <div>
240
+ <h4>#{CGI.escape_html(@book.creators.first ? @book.creators.first.name : '')}</h4>
241
+ </div>
242
+
243
+ </div>
244
+
245
+ <br />
246
+
247
+ <div style="clear:both;text-align:center;font-size:0.5em;"> #{CGI.escape_html(@book.rights ? @book.rights : '')} </div>
248
+ </div>
150
249
  </body>
151
250
  </html>
251
+
152
252
  DATA
253
+
254
+ data
153
255
  end
154
256
 
155
257
  def count_words(input)
156
258
  @book = EPUBInfo.get(input)
157
259
  resource_word_count = {}
158
260
  if @book
159
- @book.table_of_contents.resources.spine.each do |resource|
261
+ chop_by = @chop_by.eql?(:ncx) ? @book.table_of_contents.resources.ncx : @book.table_of_contents.resources.spine
262
+
263
+ chop_by.each do |resource|
160
264
  raw = Nokogiri::HTML(@book.table_of_contents.resources[resource[:uri]]) do |config|
265
+ #noinspection RubyResolve
161
266
  config.noblanks.nonet
162
267
  end
163
268
  raw.css('script').remove
@@ -187,11 +292,14 @@ DATA
187
292
  resource_allowed_word_count = @resource_word_count.select do |r|
188
293
  (word_counter += @resource_word_count[r]) < allowed_words
189
294
  end
295
+
190
296
  word_counter = resource_allowed_word_count.values.inject(0) { |sum, i| sum + i }
191
297
 
192
298
  how_many_words_left = allowed_words - word_counter
193
299
  if how_many_words_left > 0
194
300
  resource_to_split_name = @resource_word_count.keys[resource_allowed_word_count.length]
301
+
302
+ #noinspection RubyLocalVariableNamingConvention
195
303
  word_count_of_resource_to_split = @resource_word_count[resource_to_split_name]
196
304
  if how_many_words_left < word_count_of_resource_to_split
197
305
  resource_allowed_word_count.store(resource_to_split_name, how_many_words_left)
@@ -1,3 +1,3 @@
1
1
  module EPUBChop
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.6"
3
3
  end
Binary file
@@ -4,7 +4,7 @@ require 'spec_helper'
4
4
  describe 'EPUBChop' do
5
5
  before(:all) do
6
6
  #chop EPUB at 10% of total words
7
- @chop = EPUBChop.get('./spec/Verne_20000_West_pg11393.epub', {:base => :percentage, :words => 10})
7
+ @chop = EPUBChop.get('./spec/epub/Verne_20000_West_pg11393.epub', {:base => :percentage, :words => 10})
8
8
  end
9
9
 
10
10
  it 'load an epub' do
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: EPUBChop
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mehmet Celik
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-10-28 00:00:00.000000000 Z
11
+ date: 2014-01-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -114,10 +114,11 @@ files:
114
114
  - lib/EPUBChop/chop.rb
115
115
  - lib/EPUBChop/version.rb
116
116
  - lib/trollop.rb
117
- - spec/Verne_20000_West_pg11393.epub
117
+ - spec/epub/Verne_20000_West_pg11393.epub
118
+ - spec/epub/default.epub
118
119
  - spec/epubchop_spec.rb
119
120
  - spec/spec_helper.rb
120
- homepage: ''
121
+ homepage: https://github.com/mehmetc/EPUBChop
121
122
  licenses:
122
123
  - MIT
123
124
  metadata: {}
@@ -142,7 +143,7 @@ signing_key:
142
143
  specification_version: 4
143
144
  summary: Removes unwanted content from an EPUB
144
145
  test_files:
145
- - spec/Verne_20000_West_pg11393.epub
146
+ - spec/epub/Verne_20000_West_pg11393.epub
147
+ - spec/epub/default.epub
146
148
  - spec/epubchop_spec.rb
147
149
  - spec/spec_helper.rb
148
- has_rdoc: