EPUBChop 0.0.1 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- NTk5MTRjMjQ1ZDk2YTEzM2E3MWNkN2ViODEzYzdlYWQ4ODE0YWE4NA==
4
+ MmM1ZTY5M2E0NjMwN2ViZDFkYzUzODAyZmZhN2VmOGVkZTkwNTNkYQ==
5
5
  data.tar.gz: !binary |-
6
- YzYyMTA1YmYzOWY1N2UwNWRlM2ZiOTIwMmU0NTRhNGQ2YjE3YTc3Nw==
6
+ N2IwZThjYjA2Yjg3YjRhZDBiOTliMDY5Y2FmMzJmMjc4YTI3NGI3MA==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- MjI4ZDgzNWY0NzI3NmQ4YzAzNzNlNTZkMGMzZDA4MjIwMmVhOTQ4Y2EwODI5
10
- ZWZjNDNkNDRhOWI4N2YxYWMxZjc5MDU2MjFkMTIzYWQ2MTk5YmI2YTczZjEx
11
- ZWFjMWY4YTgwN2FkYjJiOGNlYTJhNTk5ZGY3N2VlZDE5MGU0NWM=
9
+ MTg1YWMwYmU5NzI1ZTEwZTQxNzJlM2M0YzU3MGY2ZGZjZTE4NDk1ZmJmMDkw
10
+ OTRkN2M0OTQ3ZDQxYThhMjRlN2ZhZjA5ZGExODIyNTg1NDczNTQ4MWM0MjU4
11
+ MzdmNDMyMWNiMDNhNzQ4ZmQ3NDY3ODgxNzQ2MjQ2ZDI5MmI1MGY=
12
12
  data.tar.gz: !binary |-
13
- YWYyZDNiMDFkOGExMWY5MGFiM2Y4MDdkYWQ3MWNlMDkxMTQ2MDkzNzIxOWE1
14
- Mzg3ZDBhNGVmMGJiZmQ0ODgzMmQzYmFkZWU2ZmNhYmZkMjUxMzc0NDQzNzE1
15
- YWNhOGZhMTVmMzhiODFhMTY1ODAzNGE2MDI5YmE5MzI4MGI3ZmI=
13
+ NTM0NzdkZDNmY2E3MzkwOGU0ODQxZjA5YmQ4ZWRjNDM1N2JlYzhkY2Q5YTcy
14
+ MDZiMjUyZDZmNzY4NDE1YzJhZTA1NzY0MjUzOWQ1ZDc3ZmQ3N2FkMzBjYjZm
15
+ NzQ3NGM2ZDUyODU0ZmEwNTA4OTQ4NGUwYTJlNDgzNmJlNDUzYjg=
data/EPUBChop.gemspec CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
10
10
  spec.email = ["mehmet@celik.be"]
11
11
  spec.description = %q{Create EPUB previews}
12
12
  spec.summary = %q{Removes unwanted content from an EPUB}
13
- spec.homepage = ""
13
+ spec.homepage = "https://github.com/mehmetc/EPUBChop"
14
14
  spec.license = "MIT"
15
15
 
16
16
  spec.files = `git ls-files`.split($/)
data/README.md CHANGED
@@ -19,10 +19,14 @@ where [options] are:
19
19
  ```
20
20
 
21
21
  ### Example:
22
+ Create a new EPUB containing 10% of the content; all other pages will show the lines "Want to read more?" and "Buy the book!"
22
23
  ```ruby
23
24
  epubchop --words 10 --base percentage -line1 "Want to read more?" -line2 "Buy the book!" my.epub
24
25
  ```
25
26
 
27
+ This gem depends on [epubinfo](http://github.com/chdorner/epubinfo). I made some additions to that gem, but they are still in a branch. Until they are accepted, I'll be using the [epubinfo_with_toc](https://github.com/mehmetc/epubinfo/tree/table_of_contents)
28
+ gem.
29
+
26
30
  ## Contributing to EPUBChop
27
31
  * Fork the project.
28
32
  * Create a new branch to implement your bugfixes or features
data/bin/epubchop CHANGED
@@ -18,6 +18,7 @@ BANNER
18
18
  opt :base, "How to interprete the --words options... Possible value: percentage", :type => :string, :default => 'percentage'
19
19
  opt :line1, "Text that is shown on line 1 of the chopped pages", :type => :string, :default => 'Continue reading?'
20
20
  opt :line2, "Text that is shown on line 2 of the chopped pages", :type => :string, :default => 'Go to your local library or buy the book.'
21
+ opt :chop_by, "Follow the SPINE or the NCX of the ePub", :type => :string, :default => :spine
21
22
  end
22
23
 
23
24
  Trollop::die "need an EPUB file name" if ARGV.empty?
@@ -30,11 +31,12 @@ begin
30
31
  text = []
31
32
  text << options[:line1] if options.has_key?(:line1)
32
33
  text << options[:line2] if options.has_key?(:line2)
34
+ chop_by << options[:chop_by]
33
35
 
34
36
  puts 'loading EPUB'
35
37
  b=EPUBChop.get(filename)
36
38
  puts 'chopping EPUB'
37
- c=b.chop({:base => base.to_s, :words => words, :text => text})
39
+ c=b.chop({:base => base.to_s, :words => words, :text => text, :chop_by => chop_by})
38
40
  puts 'rebuilding EPUB'
39
41
  FileUtils.move(c, "chopped_#{File.basename(filename)}")
40
42
 
data/lib/EPUBChop/chop.rb CHANGED
@@ -10,7 +10,6 @@ module EPUBChop
10
10
  def initialize(input, options ={})
11
11
  set_defaults(options)
12
12
 
13
-
14
13
  raise 'Please supply an input file name' if input.nil?
15
14
 
16
15
  #count the number of words in a file
@@ -31,14 +30,40 @@ module EPUBChop
31
30
  set_defaults(options)
32
31
 
33
32
  original_zip_file = @book.table_of_contents.parser.zip_file
33
+ extract_dir = extract_epub_to_tmp_dir(original_zip_file)
34
+
35
+ chop_files_in_tmp_dir(extract_dir)
36
+ remove_unused_media_from_tmp_dir(extract_dir)
37
+
38
+
39
+ return rebuild_epub_from_tmp_dir(extract_dir)
40
+ rescue Zip::ZipError => e
41
+ raise RuntimeError, "Error processing EPUB. #{e.message}"
42
+ rescue Exception => e
43
+ puts "Chopping went wrong. #{e.message}"
44
+ puts e.backtrace
45
+
46
+ return nil
47
+ ensure
48
+ FileUtils.remove_entry_secure(extract_dir)
49
+ end
50
+
51
+ private
52
+
53
+ def extract_epub_to_tmp_dir(original_zip_file)
34
54
  #unzip in temp dir
35
55
  extract_dir = Dir.mktmpdir('epub_extract')
36
56
  original_zip_file.entries.each do |e|
37
57
  file_dir = File.split(e.name)[0]
38
- Dir.mkdir(File.join(extract_dir, file_dir)) unless Dir.exists?(File.join(extract_dir, file_dir)) || file_dir.eql?(".")
58
+ FileUtils.mkdir_p(File.join(extract_dir, file_dir)) unless Dir.exists?(File.join(extract_dir, file_dir)) || file_dir.eql?('.')
39
59
  original_zip_file.extract(e, File.join(extract_dir, e.name))
40
60
  end
41
61
 
62
+ extract_dir
63
+ end
64
+
65
+
66
+ def chop_files_in_tmp_dir(extract_dir)
42
67
  #fix spine files
43
68
  filename_list = @resource_word_count.keys
44
69
  filename_list.each do |filename|
@@ -50,27 +75,39 @@ module EPUBChop
50
75
  FileUtils.rm("#{extract_dir}/#{filename}", :force => true)
51
76
  FileUtils.touch "#{extract_dir}/#{filename}"
52
77
  File.open("#{extract_dir}/#{filename}", 'w') do |f|
53
- f.puts empty_file
78
+ f.puts empty_file_with_cover(filename)
54
79
  end
55
80
 
56
81
  else
82
+ #noinspection RubyResolve
57
83
  resource = Nokogiri::XML(@book.table_of_contents.resources[filename]) do |config|
58
84
  config.noblanks.nonet
59
85
  end
60
86
  resource.css('script').remove
61
87
  resource.css('style').remove
62
88
  resource_text = resource.at_css('body').text.split[0..processed_file_size]
63
- resource_text_length = resource_text.length
89
+ #resource_text_length = resource_text.length
64
90
 
65
91
  # get a string that can be found
66
92
  data = nil
67
- window_begin = 5
93
+ window_begin = default_window_begin = 5
68
94
  window_end = 0
69
95
  while data.nil?
70
- look_for = resource_text[(processed_file_size - window_begin)..(processed_file_size - window_end)].join(' ')
71
- data = resource.at_css("p:contains('#{look_for}')")
72
- window_begin += 1
73
- window_end += 1
96
+ look_for = resource_text[(processed_file_size - window_begin)..(processed_file_size - window_end)]
97
+
98
+ if look_for.nil?
99
+ window_begin = default_window_begin += 5
100
+ window_end = 0
101
+ else
102
+ data = resource.at_css("*:contains('#{look_for.join(' ')}')")
103
+ window_begin -= 1
104
+ window_end += 1
105
+
106
+ if window_begin == window_end
107
+ window_begin = default_window_begin += 5
108
+ window_end = 0
109
+ end
110
+ end
74
111
  end
75
112
 
76
113
  #limit on found string
@@ -92,8 +129,9 @@ module EPUBChop
92
129
  end
93
130
  end
94
131
  end
95
- #TODO:remove unwanted media
132
+ end
96
133
 
134
+ def rebuild_epub_from_tmp_dir(extract_dir)
97
135
  #zip new ebook
98
136
  new_ebook_name = Tempfile.new(['epub', '.epub'], Dir.tmpdir)
99
137
  new_ebook_name_path = new_ebook_name.path
@@ -101,24 +139,56 @@ module EPUBChop
101
139
 
102
140
  zipfile = Zip::File.open(new_ebook_name_path, Zip::File::CREATE)
103
141
 
104
- Dir[File.join(extract_dir, '**', '**')].each do |file|
142
+ epub_files = Dir[File.join(extract_dir, '**', '**')]
143
+
144
+ #mimetype must be the first entry and must be stored uncompressed; otherwise FIDO will not recognise this as an EPUB
145
+ mimetype = epub_files.delete("#{extract_dir}/mimetype")
146
+ mimetype_entry = Zip::Entry.new(zipfile, mimetype.sub("#{extract_dir}/", ''), '', '', 0,0, Zip::Entry::STORED)
147
+ zipfile.add(mimetype_entry, mimetype) unless mimetype.nil?
148
+
149
+ #all the other files
150
+ epub_files.each do |file|
105
151
  zipfile.add(file.sub("#{extract_dir}/", ''), file)
106
152
  end
107
153
  zipfile.close
108
154
 
109
- return new_ebook_name_path
110
- rescue Zip::ZipError => e
111
- raise RuntimeError, ''
112
- rescue Exception => e
113
- puts "Chopping went wrong. #{e.message}"
114
- puts e.backtrace
155
+ new_ebook_name_path
156
+ end
115
157
 
116
- return nil
117
- ensure
118
- FileUtils.remove_entry_secure(extract_dir)
158
+ #noinspection RubyInstanceMethodNamingConvention
159
+ def remove_unused_media_from_tmp_dir(extract_dir)
160
+ #TODO: remove other media
161
+ #TODO: rebuild toc.ncx and content.opf
162
+ remove_unused_images_from_tmp_dir(extract_dir)
163
+ end
164
+
165
+ #noinspection RubyInstanceMethodNamingConvention
166
+ def remove_unused_images_from_tmp_dir(extract_dir)
167
+ puts 'removing unused media'
168
+ not_to_be_deleted_images = []
169
+ all_images = @book.table_of_contents.resources.images.map {|i| i[:uri]}
170
+ @book.table_of_contents.resources.html.each do |resource|
171
+ file = Nokogiri::HTML(File.read("#{extract_dir}/#{resource[:uri]}"))
172
+
173
+ all_images.each do |image|
174
+ i = image.split('/').last
175
+ data = file.at_css("img[src$='#{i}']")
176
+
177
+ if data
178
+ not_to_be_deleted_images << image
179
+ end
180
+ end
181
+ end
182
+
183
+ to_be_deleted_images = (all_images - not_to_be_deleted_images)
184
+ to_be_deleted_images.each do |image|
185
+ puts "\t\tremoving #{image}"
186
+ File.delete("#{extract_dir}/#{image}") if File.exists?("#{extract_dir}/#{image}")
187
+ end
188
+
189
+ to_be_deleted_images
119
190
  end
120
191
 
121
- private
122
192
 
123
193
  def set_defaults(options)
124
194
  @words = options[:words] || 10
@@ -130,34 +200,69 @@ module EPUBChop
130
200
  @text1 = options[:text] || 'Continue reading? Go to your local library or buy the book.'
131
201
  @text2 = ''
132
202
  end
203
+
204
+ @chop_by = options[:chop_by] || :spine
133
205
  end
134
206
 
135
- def empty_file
207
+ def empty_file_with_cover(filename)
208
+ number_of_subdirectories = filename.split('/').size - 1
209
+
210
+ cover_path = ''
211
+ number_of_subdirectories.times{ cover_path += '../'}
212
+
213
+ cover_path += @book.cover && @book.cover.exists? ? @book.cover.exists?.to_s : ''
214
+
136
215
  data = <<DATA
137
216
  <?xml version="1.0" encoding="utf-8" standalone="no"?>
138
- <!DOCTYPE html>
217
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
139
218
  <html xmlns="http://www.w3.org/1999/xhtml">
140
- <head>
141
- <title>Read more</title>
142
- </head>
143
- <body>
144
- <center>
145
- <div style='width:100%;border:1px solid black;margin-top:20px;padding:5px'>
146
- <div><h2>#{@text1}</h2></div>
147
- <div><h2>#{@text2}</h2></div>
148
- </div>
149
- </center>
219
+ <head>
220
+ <title>Read more</title>
221
+ </head>
222
+
223
+ <body>
224
+ <div style="margin-top:100px;width:500px;margin-left:auto;margin-right:auto;">
225
+ <div style='text-align:center;'>
226
+ <h2>#{CGI.escape_html(@text1 ? @text1 : '')}</h2>
227
+ <span>#{CGI.escape_html(@text2 ? @text2 : '')}</span>
228
+ </div>
229
+
230
+ <div style="margin-top:20px;">
231
+ <div style="float:left;margin-right:30px;max-height: 190px; min-height: 120px; width: 125px;">
232
+ <img src="#{cover_path}" alt="" style="width:100%" />
233
+ </div>
234
+
235
+ <div style='padding-top:10px;'>
236
+ <h3>#{CGI.escape_html(@book.titles.first ? @book.titles.first : '' )}</h3>
237
+ </div>
238
+
239
+ <div>
240
+ <h4>#{CGI.escape_html(@book.creators.first ? @book.creators.first.name : '')}</h4>
241
+ </div>
242
+
243
+ </div>
244
+
245
+ <br />
246
+
247
+ <div style="clear:both;text-align:center;font-size:0.5em;"> #{CGI.escape_html(@book.rights ? @book.rights : '')} </div>
248
+ </div>
150
249
  </body>
151
250
  </html>
251
+
152
252
  DATA
253
+
254
+ data
153
255
  end
154
256
 
155
257
  def count_words(input)
156
258
  @book = EPUBInfo.get(input)
157
259
  resource_word_count = {}
158
260
  if @book
159
- @book.table_of_contents.resources.spine.each do |resource|
261
+ chop_by = @chop_by.eql?(:ncx) ? @book.table_of_contents.resources.ncx : @book.table_of_contents.resources.spine
262
+
263
+ chop_by.each do |resource|
160
264
  raw = Nokogiri::HTML(@book.table_of_contents.resources[resource[:uri]]) do |config|
265
+ #noinspection RubyResolve
161
266
  config.noblanks.nonet
162
267
  end
163
268
  raw.css('script').remove
@@ -187,11 +292,14 @@ DATA
187
292
  resource_allowed_word_count = @resource_word_count.select do |r|
188
293
  (word_counter += @resource_word_count[r]) < allowed_words
189
294
  end
295
+
190
296
  word_counter = resource_allowed_word_count.values.inject(0) { |sum, i| sum + i }
191
297
 
192
298
  how_many_words_left = allowed_words - word_counter
193
299
  if how_many_words_left > 0
194
300
  resource_to_split_name = @resource_word_count.keys[resource_allowed_word_count.length]
301
+
302
+ #noinspection RubyLocalVariableNamingConvention
195
303
  word_count_of_resource_to_split = @resource_word_count[resource_to_split_name]
196
304
  if how_many_words_left < word_count_of_resource_to_split
197
305
  resource_allowed_word_count.store(resource_to_split_name, how_many_words_left)
@@ -1,3 +1,3 @@
1
1
  module EPUBChop
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.6"
3
3
  end
Binary file
@@ -4,7 +4,7 @@ require 'spec_helper'
4
4
  describe 'EPUBChop' do
5
5
  before(:all) do
6
6
  #chop EPUB at 10% of total words
7
- @chop = EPUBChop.get('./spec/Verne_20000_West_pg11393.epub', {:base => :percentage, :words => 10})
7
+ @chop = EPUBChop.get('./spec/epub/Verne_20000_West_pg11393.epub', {:base => :percentage, :words => 10})
8
8
  end
9
9
 
10
10
  it 'load an epub' do
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: EPUBChop
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mehmet Celik
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-10-28 00:00:00.000000000 Z
11
+ date: 2014-01-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -114,10 +114,11 @@ files:
114
114
  - lib/EPUBChop/chop.rb
115
115
  - lib/EPUBChop/version.rb
116
116
  - lib/trollop.rb
117
- - spec/Verne_20000_West_pg11393.epub
117
+ - spec/epub/Verne_20000_West_pg11393.epub
118
+ - spec/epub/default.epub
118
119
  - spec/epubchop_spec.rb
119
120
  - spec/spec_helper.rb
120
- homepage: ''
121
+ homepage: https://github.com/mehmetc/EPUBChop
121
122
  licenses:
122
123
  - MIT
123
124
  metadata: {}
@@ -142,7 +143,7 @@ signing_key:
142
143
  specification_version: 4
143
144
  summary: Removes unwanted content from an EPUB
144
145
  test_files:
145
- - spec/Verne_20000_West_pg11393.epub
146
+ - spec/epub/Verne_20000_West_pg11393.epub
147
+ - spec/epub/default.epub
146
148
  - spec/epubchop_spec.rb
147
149
  - spec/spec_helper.rb
148
- has_rdoc: