EPUBChop 0.0.7 → 0.0.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -13
- data/bin/epubchop +4 -2
- data/lib/EPUBChop/chop.rb +76 -51
- data/lib/EPUBChop/version.rb +1 -1
- metadata +40 -40
checksums.yaml
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
5
|
-
data.tar.gz: !binary |-
|
6
|
-
ZjdjY2JiNzkwY2RiZTIxMDMyM2M4YTU1MjY0OGMwOTZiYjI4NmY4Mw==
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 2ff6270a266184b41507e29e43994de705c56357
|
4
|
+
data.tar.gz: a257fd9d69ba6eb453c936626b509cc40596adf5
|
7
5
|
SHA512:
|
8
|
-
metadata.gz:
|
9
|
-
|
10
|
-
NDZiZGVjMGEwNWY5ZjgwZjA2ZjQ4OTM4MDBhMGRlNWQ3ODk5Njc1ODhlY2U2
|
11
|
-
ZmVjNmEwNGYwYTQ1ZGZiNDc0Nzc4ODliZWI0N2Y1MjcxNDQzMGY=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
ZTNlY2E2OTA4NTNkM2I0YWRjMDc1YTkxMWU5YjgyMDY4YWM0ODE2MGVjZjQ5
|
14
|
-
OGI2YmNjNWFmZjA1ZTI1OGViZjI0N2UyNjJiOTQ3MmRhOTUyOTYwZDIwNDc2
|
15
|
-
NGQ0ZWU1N2RhZDBmNGMyOGZlZWM1NzIwMDgzMGM3Y2FkZjdhYWI=
|
6
|
+
metadata.gz: 5b8d5d76d9aabb81e9f0afe92d2c0b3aa058b6be46ce9eb3dc6c61f15eceaaaa49a27a696a2063b6bc9158ddb4992be0020ba1f7072ff9b90c2cbdf73b286527
|
7
|
+
data.tar.gz: 47742a21f5173c931b35f82df4c4fa5ea2f23e2b040073a9f575f38cab819c34a100b3bf3f1dcdd6e05d88ebe3c44445c2dfadb5e378d6f509332371caa00577
|
data/bin/epubchop
CHANGED
@@ -19,6 +19,7 @@ BANNER
|
|
19
19
|
opt :line1, "Text that is shown on line 1 of the chopped pages", :type => :string, :default => 'Continue reading?'
|
20
20
|
opt :line2, "Text that is shown on line 2 of the chopped pages", :type => :string, :default => 'Go to your local library or buy the book.'
|
21
21
|
opt :chop, "Follow the SPINE or the NCX of the ePub", :type => :string, :default => 'spine'
|
22
|
+
opt :verbose, "more loging true/false", :type => :boolean, :default => false
|
22
23
|
end
|
23
24
|
|
24
25
|
Trollop::die "need an EPUB file name" if ARGV.empty?
|
@@ -32,9 +33,10 @@ begin
|
|
32
33
|
text << options[:line1] if options.has_key?(:line1)
|
33
34
|
text << options[:line2] if options.has_key?(:line2)
|
34
35
|
chop_by = options[:chop]
|
36
|
+
verbose = options[:verbose]
|
35
37
|
|
36
|
-
puts
|
37
|
-
b=EPUBChop.get(filename, :chop_by => chop_by.to_sym)
|
38
|
+
puts "loading EPUB #{filename}"
|
39
|
+
b=EPUBChop.get(filename, :chop_by => chop_by.to_sym, :verbose => verbose)
|
38
40
|
puts 'chopping EPUB'
|
39
41
|
c=b.chop({:base => base.to_s, :words => words, :text => text})
|
40
42
|
puts 'rebuilding EPUB'
|
data/lib/EPUBChop/chop.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
#encoding: UTF-8
|
1
2
|
require 'nokogiri'
|
2
3
|
require 'epubinfo'
|
3
4
|
require 'tempfile'
|
@@ -38,12 +39,10 @@ module EPUBChop
|
|
38
39
|
|
39
40
|
return rebuild_epub_from_tmp_dir(extract_dir)
|
40
41
|
rescue Zip::ZipError => e
|
41
|
-
raise RuntimeError, "Error processing EPUB. #{e.message}"
|
42
|
+
raise RuntimeError, "Error processing EPUB #{@book.table_of_contents.parser.path}.\n #{e.message}", e.backtrace
|
42
43
|
rescue Exception => e
|
43
|
-
puts
|
44
|
-
|
45
|
-
|
46
|
-
return nil
|
44
|
+
puts e.backtrace.join("\n")
|
45
|
+
raise RuntimeError, "Chopping went wrong for #{@book.table_of_contents.parser.path}.\n #{e.message}", e.backtrace
|
47
46
|
ensure
|
48
47
|
FileUtils.remove_entry_secure(extract_dir)
|
49
48
|
end
|
@@ -80,50 +79,18 @@ module EPUBChop
|
|
80
79
|
|
81
80
|
else
|
82
81
|
#noinspection RubyResolve
|
83
|
-
resource = Nokogiri::
|
82
|
+
resource = Nokogiri::HTML(@book.table_of_contents.resources[filename]) do |config|
|
83
|
+
#resource = Nokogiri::HTML.parse(@book.table_of_contents.resources[filename], 'UTF-8') do |config|
|
84
84
|
config.noblanks.nonet
|
85
85
|
end
|
86
|
-
resource.
|
87
|
-
resource.css('style').remove
|
88
|
-
resource_text = resource.at_css('body').text.split[0..processed_file_size]
|
89
|
-
#resource_text_length = resource_text.length
|
90
|
-
|
91
|
-
# get a string that can be found
|
92
|
-
data = nil
|
93
|
-
window_begin = default_window_begin = 5
|
94
|
-
window_end = 0
|
95
|
-
while data.nil?
|
96
|
-
look_for = resource_text[(processed_file_size - window_begin)..(processed_file_size - window_end)]
|
97
|
-
|
98
|
-
if look_for.nil?
|
99
|
-
window_begin = default_window_begin += 5
|
100
|
-
window_end = 0
|
101
|
-
else
|
102
|
-
data = resource.at_css("*:contains('#{look_for.join(' ')}')")
|
103
|
-
window_begin -= 1
|
104
|
-
window_end += 1
|
105
|
-
|
106
|
-
if window_begin == window_end
|
107
|
-
window_begin = default_window_begin += 5
|
108
|
-
window_end = 0
|
109
|
-
end
|
110
|
-
end
|
111
|
-
end
|
86
|
+
resource.encoding = 'UTF-8'
|
112
87
|
|
113
|
-
|
114
|
-
if data
|
115
|
-
next_data = data.next_element
|
116
|
-
while next_data
|
117
|
-
in_resource = resource.css(next_data.css_path)
|
118
|
-
in_resource.remove
|
119
|
-
|
120
|
-
next_data = data.nil? || data.next_element.to_s.length == 1 ? nil : data.next_element
|
121
|
-
end
|
122
|
-
end
|
88
|
+
resource = chop_file(resource, processed_file_size)
|
123
89
|
|
124
90
|
#persist page
|
125
|
-
File.open("#{extract_dir}/#{filename}", 'w') do |f|
|
126
|
-
f.puts resource.to_xml(:save_with => Nokogiri::XML::Node::SaveOptions::NO_DECLARATION)
|
91
|
+
File.open("#{extract_dir}/#{filename}", 'w:UTF-8') do |f|
|
92
|
+
# f.puts resource.to_xml(:save_with => Nokogiri::XML::Node::SaveOptions::NO_DECLARATION)
|
93
|
+
f.puts resource.serialize(:encoding => 'UTF-8', :save_with => Nokogiri::XML::Node::SaveOptions::NO_DECLARATION)
|
127
94
|
end
|
128
95
|
|
129
96
|
end
|
@@ -131,6 +98,61 @@ module EPUBChop
|
|
131
98
|
end
|
132
99
|
end
|
133
100
|
|
101
|
+
def chop_file(resource, processed_file_size)
|
102
|
+
#TODO: get a better algorithm to determine where to chop
|
103
|
+
return resource if resource.nil?
|
104
|
+
|
105
|
+
resource.css('script').remove
|
106
|
+
resource.css('style').remove
|
107
|
+
resource_text = resource.at_css('body').text.split[0..processed_file_size]
|
108
|
+
|
109
|
+
# get a string that can be found
|
110
|
+
data = nil
|
111
|
+
window_begin = default_window_begin = 5
|
112
|
+
window_end = 0
|
113
|
+
while data.nil?
|
114
|
+
puts "data window:#{(processed_file_size - window_begin)}..#{(processed_file_size - window_end)}" if @verbose
|
115
|
+
processed_window_begin = processed_file_size - window_begin
|
116
|
+
processed_window_end = processed_file_size - window_end
|
117
|
+
|
118
|
+
processed_window_begin = 0 if processed_window_begin < 0
|
119
|
+
processed_window_end = processed_file_size
|
120
|
+
|
121
|
+
look_for = resource_text[processed_window_begin..processed_window_end]
|
122
|
+
|
123
|
+
if look_for.nil?
|
124
|
+
window_begin = default_window_begin += 5
|
125
|
+
window_end = 0
|
126
|
+
else
|
127
|
+
look_for.map! {|m| m.gsub("'", "\'")}
|
128
|
+
data = resource.at_css("p:contains(\"#{look_for.join(' ')}\")")
|
129
|
+
data = resource.at_css("body:contains(\"#{look_for.join(' ')}\")") if data.nil?
|
130
|
+
|
131
|
+
window_begin -= 1
|
132
|
+
window_end += 1
|
133
|
+
|
134
|
+
if window_begin == window_end
|
135
|
+
window_begin = default_window_begin += 5
|
136
|
+
window_end = 0
|
137
|
+
end
|
138
|
+
end
|
139
|
+
end
|
140
|
+
|
141
|
+
#limit on found string
|
142
|
+
if data
|
143
|
+
next_data = data.next_element
|
144
|
+
while next_data
|
145
|
+
in_resource = resource.css(next_data.css_path)
|
146
|
+
in_resource.remove
|
147
|
+
|
148
|
+
next_data = data.nil? || data.next_element.to_s.length == 1 ? nil : data.next_element
|
149
|
+
end
|
150
|
+
end
|
151
|
+
|
152
|
+
resource
|
153
|
+
end
|
154
|
+
|
155
|
+
|
134
156
|
def rebuild_epub_from_tmp_dir(extract_dir)
|
135
157
|
#zip new ebook
|
136
158
|
new_ebook_name = Tempfile.new(['epub', '.epub'], Dir.tmpdir)
|
@@ -143,7 +165,7 @@ module EPUBChop
|
|
143
165
|
|
144
166
|
#minetype should be the first entry and should not be zipped. Else FIDO will not know that this is an EPUB
|
145
167
|
mimetype = epub_files.delete("#{extract_dir}/mimetype")
|
146
|
-
mimetype_entry = Zip::Entry.new(zipfile, mimetype.sub("#{extract_dir}/", ''), '', '', 0,0, Zip::Entry::STORED)
|
168
|
+
mimetype_entry = Zip::Entry.new(zipfile, mimetype.sub("#{extract_dir}/", ''), '', '', 0, 0, Zip::Entry::STORED)
|
147
169
|
zipfile.add(mimetype_entry, mimetype) unless mimetype.nil?
|
148
170
|
|
149
171
|
#all the other files
|
@@ -164,13 +186,14 @@ module EPUBChop
|
|
164
186
|
|
165
187
|
#noinspection RubyInstanceMethodNamingConvention
|
166
188
|
def remove_unused_images_from_tmp_dir(extract_dir)
|
167
|
-
puts 'removing unused media'
|
189
|
+
puts 'removing unused media' if @verbose
|
168
190
|
not_to_be_deleted_images = []
|
169
|
-
all_images = @book.table_of_contents.resources.images.map {|i| i[:uri]}
|
191
|
+
all_images = @book.table_of_contents.resources.images.map { |i| i[:uri] }
|
170
192
|
@book.table_of_contents.resources.html.each do |resource|
|
171
193
|
file = Nokogiri::HTML(File.read("#{extract_dir}/#{resource[:uri]}"))
|
172
194
|
|
173
195
|
all_images.each do |image|
|
196
|
+
next if image.nil?
|
174
197
|
i = image.split('/').last
|
175
198
|
data = file.at_css("img[src$='#{i}']")
|
176
199
|
|
@@ -182,7 +205,8 @@ module EPUBChop
|
|
182
205
|
|
183
206
|
to_be_deleted_images = (all_images - not_to_be_deleted_images)
|
184
207
|
to_be_deleted_images.each do |image|
|
185
|
-
|
208
|
+
next if image.nil?
|
209
|
+
puts "\t\tremoving #{image}" if @verbose
|
186
210
|
File.delete("#{extract_dir}/#{image}") if File.exists?("#{extract_dir}/#{image}")
|
187
211
|
end
|
188
212
|
|
@@ -202,13 +226,14 @@ module EPUBChop
|
|
202
226
|
end
|
203
227
|
|
204
228
|
@chop_by = options[:chop_by] || :spine
|
229
|
+
@verbose = options[:verbose] || false
|
205
230
|
end
|
206
231
|
|
207
232
|
def empty_file_with_cover(filename)
|
208
233
|
number_of_subdirectories = filename.split('/').size - 1
|
209
234
|
|
210
235
|
cover_path = ''
|
211
|
-
number_of_subdirectories.times{ cover_path += '../'}
|
236
|
+
number_of_subdirectories.times { cover_path += '../' }
|
212
237
|
|
213
238
|
cover_path += @book.cover && @book.cover.exists? ? @book.cover.exists?.to_s : ''
|
214
239
|
|
@@ -233,7 +258,7 @@ module EPUBChop
|
|
233
258
|
</div>
|
234
259
|
|
235
260
|
<div style='padding-top:10px;'>
|
236
|
-
<h3>#{CGI.escape_html(@book.titles.first ? @book.titles.first : ''
|
261
|
+
<h3>#{CGI.escape_html(@book.titles.first ? @book.titles.first : '')}</h3>
|
237
262
|
</div>
|
238
263
|
|
239
264
|
<div>
|
@@ -259,7 +284,7 @@ DATA
|
|
259
284
|
resource_word_count = {}
|
260
285
|
if @book
|
261
286
|
resources = @book.table_of_contents.resources.to_a
|
262
|
-
chop_by = @chop_by.eql?(:ncx)
|
287
|
+
chop_by = @chop_by.eql?(:ncx) ? @book.table_of_contents.resources.ncx : @book.table_of_contents.resources.spine
|
263
288
|
|
264
289
|
chop_by.each do |resource|
|
265
290
|
raw = Nokogiri::HTML(@book.table_of_contents.resources[resource[:uri]]) do |config|
|
data/lib/EPUBChop/version.rb
CHANGED
metadata
CHANGED
@@ -1,99 +1,99 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: EPUBChop
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.7
|
4
|
+
version: 0.0.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mehmet Celik
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-02-
|
11
|
+
date: 2014-02-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
15
|
-
|
15
|
+
version_requirements: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - ~>
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '1.3'
|
20
|
-
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirement: !ruby/object:Gem::Requirement
|
23
21
|
requirements:
|
24
22
|
- - ~>
|
25
23
|
- !ruby/object:Gem::Version
|
26
24
|
version: '1.3'
|
25
|
+
prerelease: false
|
26
|
+
type: :development
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
29
34
|
requirement: !ruby/object:Gem::Requirement
|
30
35
|
requirements:
|
31
|
-
- - ! '>='
|
36
|
+
- - '>='
|
32
37
|
- !ruby/object:Gem::Version
|
33
38
|
version: '0'
|
34
|
-
type: :development
|
35
39
|
prerelease: false
|
40
|
+
type: :development
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
36
43
|
version_requirements: !ruby/object:Gem::Requirement
|
37
44
|
requirements:
|
38
|
-
- - ! '>='
|
45
|
+
- - '>='
|
39
46
|
- !ruby/object:Gem::Version
|
40
47
|
version: '0'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: rspec
|
43
48
|
requirement: !ruby/object:Gem::Requirement
|
44
49
|
requirements:
|
45
|
-
- - ! '>='
|
50
|
+
- - '>='
|
46
51
|
- !ruby/object:Gem::Version
|
47
52
|
version: '0'
|
48
|
-
type: :development
|
49
53
|
prerelease: false
|
54
|
+
type: :development
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: epubinfo_with_toc
|
50
57
|
version_requirements: !ruby/object:Gem::Requirement
|
51
58
|
requirements:
|
52
|
-
- - ! '>='
|
59
|
+
- - '>='
|
53
60
|
- !ruby/object:Gem::Version
|
54
61
|
version: '0'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: epubinfo_with_toc
|
57
62
|
requirement: !ruby/object:Gem::Requirement
|
58
63
|
requirements:
|
59
|
-
- - ! '>='
|
64
|
+
- - '>='
|
60
65
|
- !ruby/object:Gem::Version
|
61
66
|
version: '0'
|
62
|
-
type: :runtime
|
63
67
|
prerelease: false
|
64
|
-
|
65
|
-
requirements:
|
66
|
-
- - ! '>='
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: '0'
|
68
|
+
type: :runtime
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rubyzip
|
71
|
-
|
71
|
+
version_requirements: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
73
|
- - ~>
|
74
74
|
- !ruby/object:Gem::Version
|
75
75
|
version: '1.0'
|
76
|
-
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
76
|
+
requirement: !ruby/object:Gem::Requirement
|
79
77
|
requirements:
|
80
78
|
- - ~>
|
81
79
|
- !ruby/object:Gem::Version
|
82
80
|
version: '1.0'
|
81
|
+
prerelease: false
|
82
|
+
type: :runtime
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
name: nokogiri
|
85
|
-
|
85
|
+
version_requirements: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- - ! '>='
|
87
|
+
- - '>='
|
88
88
|
- !ruby/object:Gem::Version
|
89
89
|
version: '0'
|
90
|
-
|
91
|
-
prerelease: false
|
92
|
-
version_requirements: !ruby/object:Gem::Requirement
|
90
|
+
requirement: !ruby/object:Gem::Requirement
|
93
91
|
requirements:
|
94
|
-
- - ! '>='
|
92
|
+
- - '>='
|
95
93
|
- !ruby/object:Gem::Version
|
96
94
|
version: '0'
|
95
|
+
prerelease: false
|
96
|
+
type: :runtime
|
97
97
|
description: Create EPUB previews
|
98
98
|
email:
|
99
99
|
- mehmet@celik.be
|
@@ -122,24 +122,24 @@ homepage: https://github.com/mehmetc/EPUBChop
|
|
122
122
|
licenses:
|
123
123
|
- MIT
|
124
124
|
metadata: {}
|
125
|
-
post_install_message:
|
125
|
+
post_install_message:
|
126
126
|
rdoc_options: []
|
127
127
|
require_paths:
|
128
128
|
- lib
|
129
129
|
required_ruby_version: !ruby/object:Gem::Requirement
|
130
130
|
requirements:
|
131
|
-
- - ! '>='
|
131
|
+
- - '>='
|
132
132
|
- !ruby/object:Gem::Version
|
133
133
|
version: '0'
|
134
134
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
135
135
|
requirements:
|
136
|
-
- - ! '>='
|
136
|
+
- - '>='
|
137
137
|
- !ruby/object:Gem::Version
|
138
138
|
version: '0'
|
139
139
|
requirements: []
|
140
|
-
rubyforge_project:
|
141
|
-
rubygems_version: 2.
|
142
|
-
signing_key:
|
140
|
+
rubyforge_project:
|
141
|
+
rubygems_version: 2.2.2
|
142
|
+
signing_key:
|
143
143
|
specification_version: 4
|
144
144
|
summary: Removes unwanted content from an EPUB
|
145
145
|
test_files:
|