EPUBChop 0.0.7 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -13
- data/bin/epubchop +4 -2
- data/lib/EPUBChop/chop.rb +76 -51
- data/lib/EPUBChop/version.rb +1 -1
- metadata +40 -40
checksums.yaml
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
5
|
-
data.tar.gz: !binary |-
|
6
|
-
ZjdjY2JiNzkwY2RiZTIxMDMyM2M4YTU1MjY0OGMwOTZiYjI4NmY4Mw==
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 2ff6270a266184b41507e29e43994de705c56357
|
4
|
+
data.tar.gz: a257fd9d69ba6eb453c936626b509cc40596adf5
|
7
5
|
SHA512:
|
8
|
-
metadata.gz:
|
9
|
-
|
10
|
-
NDZiZGVjMGEwNWY5ZjgwZjA2ZjQ4OTM4MDBhMGRlNWQ3ODk5Njc1ODhlY2U2
|
11
|
-
ZmVjNmEwNGYwYTQ1ZGZiNDc0Nzc4ODliZWI0N2Y1MjcxNDQzMGY=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
ZTNlY2E2OTA4NTNkM2I0YWRjMDc1YTkxMWU5YjgyMDY4YWM0ODE2MGVjZjQ5
|
14
|
-
OGI2YmNjNWFmZjA1ZTI1OGViZjI0N2UyNjJiOTQ3MmRhOTUyOTYwZDIwNDc2
|
15
|
-
NGQ0ZWU1N2RhZDBmNGMyOGZlZWM1NzIwMDgzMGM3Y2FkZjdhYWI=
|
6
|
+
metadata.gz: 5b8d5d76d9aabb81e9f0afe92d2c0b3aa058b6be46ce9eb3dc6c61f15eceaaaa49a27a696a2063b6bc9158ddb4992be0020ba1f7072ff9b90c2cbdf73b286527
|
7
|
+
data.tar.gz: 47742a21f5173c931b35f82df4c4fa5ea2f23e2b040073a9f575f38cab819c34a100b3bf3f1dcdd6e05d88ebe3c44445c2dfadb5e378d6f509332371caa00577
|
data/bin/epubchop
CHANGED
@@ -19,6 +19,7 @@ BANNER
|
|
19
19
|
opt :line1, "Text that is shown on line 1 of the chopped pages", :type => :string, :default => 'Continue reading?'
|
20
20
|
opt :line2, "Text that is shown on line 2 of the chopped pages", :type => :string, :default => 'Go to your local library or buy the book.'
|
21
21
|
opt :chop, "Follow the SPINE or the NCX of the ePub", :type => :string, :default => 'spine'
|
22
|
+
opt :verbose, "more loging true/false", :type => :boolean, :default => false
|
22
23
|
end
|
23
24
|
|
24
25
|
Trollop::die "need an EPUB file name" if ARGV.empty?
|
@@ -32,9 +33,10 @@ begin
|
|
32
33
|
text << options[:line1] if options.has_key?(:line1)
|
33
34
|
text << options[:line2] if options.has_key?(:line2)
|
34
35
|
chop_by = options[:chop]
|
36
|
+
verbose = options[:verbose]
|
35
37
|
|
36
|
-
puts
|
37
|
-
b=EPUBChop.get(filename, :chop_by => chop_by.to_sym)
|
38
|
+
puts "loading EPUB #{filename}"
|
39
|
+
b=EPUBChop.get(filename, :chop_by => chop_by.to_sym, :verbose => verbose)
|
38
40
|
puts 'chopping EPUB'
|
39
41
|
c=b.chop({:base => base.to_s, :words => words, :text => text})
|
40
42
|
puts 'rebuilding EPUB'
|
data/lib/EPUBChop/chop.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
#encoding: UTF-8
|
1
2
|
require 'nokogiri'
|
2
3
|
require 'epubinfo'
|
3
4
|
require 'tempfile'
|
@@ -38,12 +39,10 @@ module EPUBChop
|
|
38
39
|
|
39
40
|
return rebuild_epub_from_tmp_dir(extract_dir)
|
40
41
|
rescue Zip::ZipError => e
|
41
|
-
raise RuntimeError, "Error processing EPUB. #{e.message}"
|
42
|
+
raise RuntimeError, "Error processing EPUB #{@book.table_of_contents.parser.path}.\n #{e.message}", e.backtrace
|
42
43
|
rescue Exception => e
|
43
|
-
puts
|
44
|
-
|
45
|
-
|
46
|
-
return nil
|
44
|
+
puts e.backtrace.join("\n")
|
45
|
+
raise RuntimeError, "Chopping went wrong for #{@book.table_of_contents.parser.path}.\n #{e.message}", e.backtrace
|
47
46
|
ensure
|
48
47
|
FileUtils.remove_entry_secure(extract_dir)
|
49
48
|
end
|
@@ -80,50 +79,18 @@ module EPUBChop
|
|
80
79
|
|
81
80
|
else
|
82
81
|
#noinspection RubyResolve
|
83
|
-
resource = Nokogiri::
|
82
|
+
resource = Nokogiri::HTML(@book.table_of_contents.resources[filename]) do |config|
|
83
|
+
#resource = Nokogiri::HTML.parse(@book.table_of_contents.resources[filename], 'UTF-8') do |config|
|
84
84
|
config.noblanks.nonet
|
85
85
|
end
|
86
|
-
resource.css('script').remove
|
87
|
-
resource.css('style').remove
|
88
|
-
resource_text = resource.at_css('body').text.split[0..processed_file_size]
|
89
|
-
#resource_text_length = resource_text.length
|
90
|
-
|
91
|
-
# get a string that can be found
|
92
|
-
data = nil
|
93
|
-
window_begin = default_window_begin = 5
|
94
|
-
window_end = 0
|
95
|
-
while data.nil?
|
96
|
-
look_for = resource_text[(processed_file_size - window_begin)..(processed_file_size - window_end)]
|
97
|
-
|
98
|
-
if look_for.nil?
|
99
|
-
window_begin = default_window_begin += 5
|
100
|
-
window_end = 0
|
101
|
-
else
|
102
|
-
data = resource.at_css("*:contains('#{look_for.join(' ')}')")
|
103
|
-
window_begin -= 1
|
104
|
-
window_end += 1
|
105
|
-
|
106
|
-
if window_begin == window_end
|
107
|
-
window_begin = default_window_begin += 5
|
108
|
-
window_end = 0
|
109
|
-
end
|
110
|
-
end
|
111
|
-
end
|
86
|
+
resource.encoding = 'UTF-8'
|
112
87
|
|
113
|
-
|
114
|
-
if data
|
115
|
-
next_data = data.next_element
|
116
|
-
while next_data
|
117
|
-
in_resource = resource.css(next_data.css_path)
|
118
|
-
in_resource.remove
|
119
|
-
|
120
|
-
next_data = data.nil? || data.next_element.to_s.length == 1 ? nil : data.next_element
|
121
|
-
end
|
122
|
-
end
|
88
|
+
resource = chop_file(resource, processed_file_size)
|
123
89
|
|
124
90
|
#persist page
|
125
|
-
File.open("#{extract_dir}/#{filename}", 'w') do |f|
|
126
|
-
f.puts resource.to_xml(:save_with => Nokogiri::XML::Node::SaveOptions::NO_DECLARATION)
|
91
|
+
File.open("#{extract_dir}/#{filename}", 'w:UTF-8') do |f|
|
92
|
+
# f.puts resource.to_xml(:save_with => Nokogiri::XML::Node::SaveOptions::NO_DECLARATION)
|
93
|
+
f.puts resource.serialize(:encoding => 'UTF-8', :save_with => Nokogiri::XML::Node::SaveOptions::NO_DECLARATION)
|
127
94
|
end
|
128
95
|
|
129
96
|
end
|
@@ -131,6 +98,61 @@ module EPUBChop
|
|
131
98
|
end
|
132
99
|
end
|
133
100
|
|
101
|
+
def chop_file(resource, processed_file_size)
|
102
|
+
#TODO: get a better algorithm to determine where to chop
|
103
|
+
return resource if resource.nil?
|
104
|
+
|
105
|
+
resource.css('script').remove
|
106
|
+
resource.css('style').remove
|
107
|
+
resource_text = resource.at_css('body').text.split[0..processed_file_size]
|
108
|
+
|
109
|
+
# get a string that can be found
|
110
|
+
data = nil
|
111
|
+
window_begin = default_window_begin = 5
|
112
|
+
window_end = 0
|
113
|
+
while data.nil?
|
114
|
+
puts "data window:#{(processed_file_size - window_begin)}..#{(processed_file_size - window_end)}" if @verbose
|
115
|
+
processed_window_begin = processed_file_size - window_begin
|
116
|
+
processed_window_end = processed_file_size - window_end
|
117
|
+
|
118
|
+
processed_window_begin = 0 if processed_window_begin < 0
|
119
|
+
processed_window_end = processed_file_size
|
120
|
+
|
121
|
+
look_for = resource_text[processed_window_begin..processed_window_end]
|
122
|
+
|
123
|
+
if look_for.nil?
|
124
|
+
window_begin = default_window_begin += 5
|
125
|
+
window_end = 0
|
126
|
+
else
|
127
|
+
look_for.map! {|m| m.gsub("'", "\'")}
|
128
|
+
data = resource.at_css("p:contains(\"#{look_for.join(' ')}\")")
|
129
|
+
data = resource.at_css("body:contains(\"#{look_for.join(' ')}\")") if data.nil?
|
130
|
+
|
131
|
+
window_begin -= 1
|
132
|
+
window_end += 1
|
133
|
+
|
134
|
+
if window_begin == window_end
|
135
|
+
window_begin = default_window_begin += 5
|
136
|
+
window_end = 0
|
137
|
+
end
|
138
|
+
end
|
139
|
+
end
|
140
|
+
|
141
|
+
#limit on found string
|
142
|
+
if data
|
143
|
+
next_data = data.next_element
|
144
|
+
while next_data
|
145
|
+
in_resource = resource.css(next_data.css_path)
|
146
|
+
in_resource.remove
|
147
|
+
|
148
|
+
next_data = data.nil? || data.next_element.to_s.length == 1 ? nil : data.next_element
|
149
|
+
end
|
150
|
+
end
|
151
|
+
|
152
|
+
resource
|
153
|
+
end
|
154
|
+
|
155
|
+
|
134
156
|
def rebuild_epub_from_tmp_dir(extract_dir)
|
135
157
|
#zip new ebook
|
136
158
|
new_ebook_name = Tempfile.new(['epub', '.epub'], Dir.tmpdir)
|
@@ -143,7 +165,7 @@ module EPUBChop
|
|
143
165
|
|
144
166
|
#minetype should be the first entry and should not be zipped. Else FIDO will not know that this is an EPUB
|
145
167
|
mimetype = epub_files.delete("#{extract_dir}/mimetype")
|
146
|
-
mimetype_entry = Zip::Entry.new(zipfile, mimetype.sub("#{extract_dir}/", ''), '', '', 0,0, Zip::Entry::STORED)
|
168
|
+
mimetype_entry = Zip::Entry.new(zipfile, mimetype.sub("#{extract_dir}/", ''), '', '', 0, 0, Zip::Entry::STORED)
|
147
169
|
zipfile.add(mimetype_entry, mimetype) unless mimetype.nil?
|
148
170
|
|
149
171
|
#all the other files
|
@@ -164,13 +186,14 @@ module EPUBChop
|
|
164
186
|
|
165
187
|
#noinspection RubyInstanceMethodNamingConvention
|
166
188
|
def remove_unused_images_from_tmp_dir(extract_dir)
|
167
|
-
puts 'removing unused media'
|
189
|
+
puts 'removing unused media' if @verbose
|
168
190
|
not_to_be_deleted_images = []
|
169
|
-
all_images = @book.table_of_contents.resources.images.map {|i| i[:uri]}
|
191
|
+
all_images = @book.table_of_contents.resources.images.map { |i| i[:uri] }
|
170
192
|
@book.table_of_contents.resources.html.each do |resource|
|
171
193
|
file = Nokogiri::HTML(File.read("#{extract_dir}/#{resource[:uri]}"))
|
172
194
|
|
173
195
|
all_images.each do |image|
|
196
|
+
next if image.nil?
|
174
197
|
i = image.split('/').last
|
175
198
|
data = file.at_css("img[src$='#{i}']")
|
176
199
|
|
@@ -182,7 +205,8 @@ module EPUBChop
|
|
182
205
|
|
183
206
|
to_be_deleted_images = (all_images - not_to_be_deleted_images)
|
184
207
|
to_be_deleted_images.each do |image|
|
185
|
-
|
208
|
+
next if image.nil?
|
209
|
+
puts "\t\tremoving #{image}" if @verbose
|
186
210
|
File.delete("#{extract_dir}/#{image}") if File.exists?("#{extract_dir}/#{image}")
|
187
211
|
end
|
188
212
|
|
@@ -202,13 +226,14 @@ module EPUBChop
|
|
202
226
|
end
|
203
227
|
|
204
228
|
@chop_by = options[:chop_by] || :spine
|
229
|
+
@verbose = options[:verbose] || false
|
205
230
|
end
|
206
231
|
|
207
232
|
def empty_file_with_cover(filename)
|
208
233
|
number_of_subdirectories = filename.split('/').size - 1
|
209
234
|
|
210
235
|
cover_path = ''
|
211
|
-
number_of_subdirectories.times{ cover_path += '../'}
|
236
|
+
number_of_subdirectories.times { cover_path += '../' }
|
212
237
|
|
213
238
|
cover_path += @book.cover && @book.cover.exists? ? @book.cover.exists?.to_s : ''
|
214
239
|
|
@@ -233,7 +258,7 @@ module EPUBChop
|
|
233
258
|
</div>
|
234
259
|
|
235
260
|
<div style='padding-top:10px;'>
|
236
|
-
<h3>#{CGI.escape_html(@book.titles.first ? @book.titles.first : ''
|
261
|
+
<h3>#{CGI.escape_html(@book.titles.first ? @book.titles.first : '')}</h3>
|
237
262
|
</div>
|
238
263
|
|
239
264
|
<div>
|
@@ -259,7 +284,7 @@ DATA
|
|
259
284
|
resource_word_count = {}
|
260
285
|
if @book
|
261
286
|
resources = @book.table_of_contents.resources.to_a
|
262
|
-
chop_by = @chop_by.eql?(:ncx)
|
287
|
+
chop_by = @chop_by.eql?(:ncx) ? @book.table_of_contents.resources.ncx : @book.table_of_contents.resources.spine
|
263
288
|
|
264
289
|
chop_by.each do |resource|
|
265
290
|
raw = Nokogiri::HTML(@book.table_of_contents.resources[resource[:uri]]) do |config|
|
data/lib/EPUBChop/version.rb
CHANGED
metadata
CHANGED
@@ -1,99 +1,99 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: EPUBChop
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.7
|
4
|
+
version: 0.0.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mehmet Celik
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-02-
|
11
|
+
date: 2014-02-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
15
|
-
|
15
|
+
version_requirements: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - ~>
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '1.3'
|
20
|
-
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirement: !ruby/object:Gem::Requirement
|
23
21
|
requirements:
|
24
22
|
- - ~>
|
25
23
|
- !ruby/object:Gem::Version
|
26
24
|
version: '1.3'
|
25
|
+
prerelease: false
|
26
|
+
type: :development
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
29
34
|
requirement: !ruby/object:Gem::Requirement
|
30
35
|
requirements:
|
31
|
-
- - ! '>='
|
36
|
+
- - '>='
|
32
37
|
- !ruby/object:Gem::Version
|
33
38
|
version: '0'
|
34
|
-
type: :development
|
35
39
|
prerelease: false
|
40
|
+
type: :development
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
36
43
|
version_requirements: !ruby/object:Gem::Requirement
|
37
44
|
requirements:
|
38
|
-
- - ! '>='
|
45
|
+
- - '>='
|
39
46
|
- !ruby/object:Gem::Version
|
40
47
|
version: '0'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: rspec
|
43
48
|
requirement: !ruby/object:Gem::Requirement
|
44
49
|
requirements:
|
45
|
-
- - ! '>='
|
50
|
+
- - '>='
|
46
51
|
- !ruby/object:Gem::Version
|
47
52
|
version: '0'
|
48
|
-
type: :development
|
49
53
|
prerelease: false
|
54
|
+
type: :development
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: epubinfo_with_toc
|
50
57
|
version_requirements: !ruby/object:Gem::Requirement
|
51
58
|
requirements:
|
52
|
-
- - ! '>='
|
59
|
+
- - '>='
|
53
60
|
- !ruby/object:Gem::Version
|
54
61
|
version: '0'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: epubinfo_with_toc
|
57
62
|
requirement: !ruby/object:Gem::Requirement
|
58
63
|
requirements:
|
59
|
-
- - ! '>='
|
64
|
+
- - '>='
|
60
65
|
- !ruby/object:Gem::Version
|
61
66
|
version: '0'
|
62
|
-
type: :runtime
|
63
67
|
prerelease: false
|
64
|
-
|
65
|
-
requirements:
|
66
|
-
- - ! '>='
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: '0'
|
68
|
+
type: :runtime
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rubyzip
|
71
|
-
|
71
|
+
version_requirements: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
73
|
- - ~>
|
74
74
|
- !ruby/object:Gem::Version
|
75
75
|
version: '1.0'
|
76
|
-
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
76
|
+
requirement: !ruby/object:Gem::Requirement
|
79
77
|
requirements:
|
80
78
|
- - ~>
|
81
79
|
- !ruby/object:Gem::Version
|
82
80
|
version: '1.0'
|
81
|
+
prerelease: false
|
82
|
+
type: :runtime
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
name: nokogiri
|
85
|
-
|
85
|
+
version_requirements: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- - ! '>='
|
87
|
+
- - '>='
|
88
88
|
- !ruby/object:Gem::Version
|
89
89
|
version: '0'
|
90
|
-
|
91
|
-
prerelease: false
|
92
|
-
version_requirements: !ruby/object:Gem::Requirement
|
90
|
+
requirement: !ruby/object:Gem::Requirement
|
93
91
|
requirements:
|
94
|
-
- - ! '>='
|
92
|
+
- - '>='
|
95
93
|
- !ruby/object:Gem::Version
|
96
94
|
version: '0'
|
95
|
+
prerelease: false
|
96
|
+
type: :runtime
|
97
97
|
description: Create EPUB previews
|
98
98
|
email:
|
99
99
|
- mehmet@celik.be
|
@@ -122,24 +122,24 @@ homepage: https://github.com/mehmetc/EPUBChop
|
|
122
122
|
licenses:
|
123
123
|
- MIT
|
124
124
|
metadata: {}
|
125
|
-
post_install_message:
|
125
|
+
post_install_message:
|
126
126
|
rdoc_options: []
|
127
127
|
require_paths:
|
128
128
|
- lib
|
129
129
|
required_ruby_version: !ruby/object:Gem::Requirement
|
130
130
|
requirements:
|
131
|
-
- - ! '>='
|
131
|
+
- - '>='
|
132
132
|
- !ruby/object:Gem::Version
|
133
133
|
version: '0'
|
134
134
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
135
135
|
requirements:
|
136
|
-
- - ! '>='
|
136
|
+
- - '>='
|
137
137
|
- !ruby/object:Gem::Version
|
138
138
|
version: '0'
|
139
139
|
requirements: []
|
140
|
-
rubyforge_project:
|
141
|
-
rubygems_version: 2.
|
142
|
-
signing_key:
|
140
|
+
rubyforge_project:
|
141
|
+
rubygems_version: 2.2.2
|
142
|
+
signing_key:
|
143
143
|
specification_version: 4
|
144
144
|
summary: Removes unwanted content from an EPUB
|
145
145
|
test_files:
|