ebook_tools 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- data/ebook_tools.gemspec +10 -9
- data/lib/ebook_tools.rb +32 -35
- data/lib/epub_book.rb +164 -0
- data/lib/paras_in_mongo.rb +1 -0
- data/lib/txt_book.rb +38 -18
- data/lib/utils.rb +15 -0
- metadata +8 -7
data/ebook_tools.gemspec
CHANGED
@@ -2,13 +2,13 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{ebook_tools}
|
5
|
-
s.version = '0.1.4'
|
5
|
+
s.version = '0.1.5'
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Aaron"]
|
9
|
-
s.date = %q{2013-04
|
9
|
+
s.date = %q{2013-06-04}
|
10
10
|
s.description = %q{电子书工具集.}
|
11
|
-
s.email = %q{
|
11
|
+
s.email = %q{yalong1976@gmail.com}
|
12
12
|
s.require_paths = ["lib"]
|
13
13
|
s.requirements = ["none"]
|
14
14
|
s.summary = %q{电子书工具集.}
|
@@ -25,15 +25,16 @@ Gem::Specification.new do |s|
|
|
25
25
|
"bin/doc_book_import_mongo",
|
26
26
|
"bin/para_import_scheduling",
|
27
27
|
"bin/xml2json",
|
28
|
+
"lib/doc_book_in_mongo.rb",
|
28
29
|
"lib/ebook_tools.rb",
|
29
|
-
"lib/txt_book.rb",
|
30
|
-
"lib/header_detect.rb",
|
31
|
-
"lib/pdf.rb",
|
32
|
-
"lib/txt.rb",
|
33
30
|
"lib/epub.rb",
|
31
|
+
"lib/epub_book.rb",
|
32
|
+
"lib/header_detect.rb",
|
33
|
+
"lib/paras_in_mongo.rb",
|
34
|
+
"lib/pdf.rb",
|
35
|
+
"lib/txt.rb",
|
36
|
+
"lib/txt_book.rb",
|
34
37
|
"lib/utils.rb",
|
35
|
-
"lib/paras_in_mongo.rb",
|
36
|
-
"lib/doc_book_in_mongo.rb",
|
37
38
|
"workers/para_import_worker.rb",
|
38
39
|
"ebook_tools.gemspec"
|
39
40
|
]
|
data/lib/ebook_tools.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# encoding: UTF-8
|
3
|
-
['utils','epub','txt','pdf','header_detect','txt_book'].each do |file|
|
3
|
+
['utils','epub','txt','pdf','header_detect','txt_book','epub_book'].each do |file|
|
4
4
|
require File.join(File.dirname(__FILE__),file)
|
5
5
|
end
|
6
6
|
|
@@ -202,27 +202,34 @@ module EbookTools
|
|
202
202
|
|
203
203
|
def extract_book_struct_to_file(source,destination,options={})
|
204
204
|
options[:title] ||= File.basename(source,File.extname(source))
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
txt_book = TxtBook.new(content,options)
|
215
|
-
docbook_xml = txt_book.to_doc_book
|
216
|
-
if docbook_xml
|
217
|
-
FileUtils.mkdir_p(File.dirname(destination)) unless Dir.exists?(File.dirname(destination))
|
218
|
-
File.open(destination,'wb'){|file|file.write docbook_xml}
|
219
|
-
puts "目录结构:"
|
220
|
-
puts txt_book.toc_to_text
|
221
|
-
puts "共修复#{txt_book.breaklines_count}个断点."
|
222
|
-
return true
|
205
|
+
if File.extname(source) == '.epub'
|
206
|
+
epub_book = EpubBook.new(source,options)
|
207
|
+
docbook_xml = epub_book.to_doc_book
|
208
|
+
if docbook_xml
|
209
|
+
write_doc_book(destination,docbook_xml)
|
210
|
+
puts "目录结构:"
|
211
|
+
puts epub_book.toc_to_text
|
212
|
+
return true
|
213
|
+
end
|
223
214
|
else
|
224
|
-
|
215
|
+
content = case File.extname(source)
|
216
|
+
when '.html'
|
217
|
+
Utils.extract_text_from_file(source,'.html')
|
218
|
+
when '.txt'
|
219
|
+
File.open(source).read
|
220
|
+
end
|
221
|
+
txt_book = TxtBook.new(content,options)
|
222
|
+
docbook_xml = txt_book.to_doc_book
|
223
|
+
if docbook_xml
|
224
|
+
write_doc_book(destination,docbook_xml)
|
225
|
+
puts "目录结构:"
|
226
|
+
puts txt_book.toc_to_text
|
227
|
+
puts "共修复#{txt_book.breaklines_count}个断点."
|
228
|
+
return true
|
229
|
+
end
|
225
230
|
end
|
231
|
+
|
232
|
+
return nil
|
226
233
|
end
|
227
234
|
|
228
235
|
# batch_extract_from_dir
|
@@ -262,6 +269,11 @@ module EbookTools
|
|
262
269
|
end
|
263
270
|
end
|
264
271
|
|
272
|
+
def write_doc_book(destination, docbook_xml)
|
273
|
+
FileUtils.mkdir_p(File.dirname(destination)) unless Dir.exists?(File.dirname(destination))
|
274
|
+
File.open(destination,'wb'){|file|file.write docbook_xml}
|
275
|
+
end
|
276
|
+
|
265
277
|
# text_paras_repair
|
266
278
|
# 对文本文件格式中的中断段落进行修复
|
267
279
|
def text_paras_repair(source_file,target_file,options={})
|
@@ -271,21 +283,6 @@ module EbookTools
|
|
271
283
|
File.open(target_file,'w'){|file| file.write content}
|
272
284
|
end
|
273
285
|
|
274
|
-
def extract_text_from_file(filename,format)
|
275
|
-
txt_file = File.basename(filename,format)
|
276
|
-
if !filename.include?("'")
|
277
|
-
cmd = %Q(ebook-convert '#{filename}' '#{txt_file}.txt')
|
278
|
-
elsif !filename.include?('"')
|
279
|
-
cmd = %Q(ebook-convert "#{filename}" "#{txt_file}.txt")
|
280
|
-
else
|
281
|
-
cmd = %Q(ebook-convert #{filename} #{txt_file}.txt)
|
282
|
-
end
|
283
|
-
output = `#{cmd}`
|
284
|
-
content = File.open("#{txt_file}.txt").read
|
285
|
-
FileUtils.remove_file("#{txt_file}.txt",true)
|
286
|
-
return content
|
287
|
-
end
|
288
|
-
|
289
286
|
# sanitize_for_epub_text
|
290
287
|
def sanitize_for_epub_text(content)
|
291
288
|
return content if content.blank?
|
data/lib/epub_book.rb
ADDED
@@ -0,0 +1,164 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'uuid'
|
3
|
+
require 'cgi'
|
4
|
+
|
5
|
+
# epub_book
|
6
|
+
# 处理EPUB书的类。
|
7
|
+
class EpubBook
|
8
|
+
attr_reader :title,:author,:publisher,:pubdate,:isbn,:content,:outline
|
9
|
+
|
10
|
+
def initialize(filename,options={})
|
11
|
+
raise '无效的文件' unless File.exists?(filename)
|
12
|
+
@title = options[:title]
|
13
|
+
@author = options[:author]
|
14
|
+
@publisher = options[:publisher]
|
15
|
+
@pubdate= options[:pubdate]
|
16
|
+
@isbn = options[:isbn]
|
17
|
+
|
18
|
+
text = Utils.extract_text_from_file(filename,'.epub')
|
19
|
+
unless Utils.detect_utf8(text)
|
20
|
+
text = Utils.to_utf8(text)
|
21
|
+
end
|
22
|
+
text = preprocess_content(text)
|
23
|
+
@outline, @content = extract_for_epub_text(text)
|
24
|
+
end
|
25
|
+
|
26
|
+
def toc_to_text
|
27
|
+
outline
|
28
|
+
end
|
29
|
+
|
30
|
+
def to_doc_book
|
31
|
+
build_doc_book(@outline,@content,{:title=>title,:publisher=>publisher,:pubdate=>pubdate,:author=>author,:isbn=>isbn})
|
32
|
+
end
|
33
|
+
|
34
|
+
|
35
|
+
private
|
36
|
+
def preprocess_content(content)
|
37
|
+
paras = extract_paras(content)
|
38
|
+
paras.join("\n")
|
39
|
+
end
|
40
|
+
|
41
|
+
def extract_paras(content)
|
42
|
+
paras = []
|
43
|
+
return paras if content.blank?
|
44
|
+
content.each_line do |line|
|
45
|
+
text = Utils.clean_text(line)
|
46
|
+
paras << text if text.length > 0
|
47
|
+
end
|
48
|
+
paras
|
49
|
+
end
|
50
|
+
|
51
|
+
def build_doc_book(outline,content,options={})
|
52
|
+
doc_toc = gen_docbook_toc(outline.split("\n"))
|
53
|
+
|
54
|
+
doc_content = gen_docbook_content(content)
|
55
|
+
|
56
|
+
<<-EOS
|
57
|
+
<?xml version="1.0" encoding="utf-8"?>
|
58
|
+
<book xmlns="http://docbook.org/ns/docbook" version="5.0" id="#{UUID.generate}">
|
59
|
+
<info>
|
60
|
+
<title>#{options[:title]}</title>
|
61
|
+
<authorgroup>
|
62
|
+
<author><personname>#{options[:author]}</personname></author>
|
63
|
+
</authorgroup>
|
64
|
+
<pubdate>#{options[:pubdate]}</pubdate>
|
65
|
+
<publisher><publishername>#{options[:publisher]}</publishername></publisher>
|
66
|
+
</info>
|
67
|
+
#{doc_toc}
|
68
|
+
#{doc_content}
|
69
|
+
</book>
|
70
|
+
EOS
|
71
|
+
end
|
72
|
+
|
73
|
+
def gen_docbook_toc(toc)
|
74
|
+
"<toc><title>Table of contents</title>#{gen_docbook_tocdiv(toc)}</toc>"
|
75
|
+
end
|
76
|
+
|
77
|
+
def gen_docbook_tocdiv(toc)
|
78
|
+
toc.map do |item|
|
79
|
+
text = Utils.escape_html(Utils.clean_text(item))
|
80
|
+
"<tocdiv><title>#{item}</title></tocdiv>"
|
81
|
+
end.join("")
|
82
|
+
end
|
83
|
+
|
84
|
+
def gen_docbook_content(content)
|
85
|
+
paras = extract_paras(content)
|
86
|
+
paras_content = paras.map do |para|
|
87
|
+
text = Utils.escape_html(Utils.clean_text(para))
|
88
|
+
"<para id='#{UUID.generate}'>#{text}</para>"
|
89
|
+
end.join("\n")
|
90
|
+
"<sect1>#{paras_content}</sect1>"
|
91
|
+
end
|
92
|
+
|
93
|
+
# extract_for_epub_text
|
94
|
+
def extract_for_epub_text(content)
|
95
|
+
return content if content.blank?
|
96
|
+
if outline_type?(content)
|
97
|
+
extract_outline_with_content(content)
|
98
|
+
elsif toc_type?(content)
|
99
|
+
extract_toc_with_content(content)
|
100
|
+
else
|
101
|
+
['',content]
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
def extract_outline_with_content(content)
|
106
|
+
outline_flag = false
|
107
|
+
lines = []
|
108
|
+
outline = []
|
109
|
+
content.each_line do |line|
|
110
|
+
if line.strip.downcase == 'document outline'
|
111
|
+
outline_flag = true
|
112
|
+
next
|
113
|
+
end
|
114
|
+
unless outline_flag
|
115
|
+
lines << line
|
116
|
+
else
|
117
|
+
outline << line;
|
118
|
+
end
|
119
|
+
end
|
120
|
+
[outline.join("") , lines.join("")]
|
121
|
+
end
|
122
|
+
|
123
|
+
def extract_toc_with_content(content)
|
124
|
+
toc = []
|
125
|
+
lines = []
|
126
|
+
|
127
|
+
paras = extract_paras(content)
|
128
|
+
index = paras.index('Content')
|
129
|
+
paras = paras[(index+1)..-1]
|
130
|
+
|
131
|
+
point = nil
|
132
|
+
|
133
|
+
paras.each_with_index do |para, index|
|
134
|
+
if toc.include?(para)
|
135
|
+
point = index
|
136
|
+
break
|
137
|
+
else
|
138
|
+
toc << para
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
142
|
+
lines = paras[(point+1)..-1]
|
143
|
+
[toc.join("\n") , lines.join("\n")]
|
144
|
+
rescue
|
145
|
+
['', content]
|
146
|
+
end
|
147
|
+
|
148
|
+
def outline_type?(content)
|
149
|
+
content.each_line do |line|
|
150
|
+
return true if line.strip.downcase == 'document outline'
|
151
|
+
end
|
152
|
+
false
|
153
|
+
end
|
154
|
+
|
155
|
+
def toc_type?(content)
|
156
|
+
toc_flag = false
|
157
|
+
toc_flag1 = false
|
158
|
+
content.each_line do |line|
|
159
|
+
toc_flag = true if line.strip.downcase == '目录'
|
160
|
+
toc_flag1 = true if line.strip.downcase == 'content'
|
161
|
+
end
|
162
|
+
toc_flag && toc_flag1
|
163
|
+
end
|
164
|
+
end
|
data/lib/paras_in_mongo.rb
CHANGED
data/lib/txt_book.rb
CHANGED
@@ -87,18 +87,22 @@ class TxtBook
|
|
87
87
|
end
|
88
88
|
|
89
89
|
def toc
|
90
|
-
@toc ||= extract_toc_from_struct(struct_content)
|
90
|
+
@toc ||= extract_toc_from_struct(struct_content) if struct_content
|
91
91
|
end
|
92
92
|
|
93
93
|
def toc_to_text
|
94
|
-
|
95
|
-
|
94
|
+
if toc
|
95
|
+
gen_toc(toc) do |item,children|
|
96
|
+
"#{item[:title]}\n#{children}"
|
97
|
+
end
|
96
98
|
end
|
97
99
|
end
|
98
100
|
|
99
101
|
def to_doc_book
|
100
102
|
if struct_content
|
101
103
|
build_doc_book(struct_content,{:title=>title,:publisher=>publisher,:pubdate=>pubdate,:author=>author,:isbn=>isbn})
|
104
|
+
else
|
105
|
+
build_doc_book(content,{:title=>title,:publisher=>publisher,:pubdate=>pubdate,:author=>author,:isbn=>isbn})
|
102
106
|
end
|
103
107
|
end
|
104
108
|
|
@@ -184,12 +188,15 @@ class TxtBook
|
|
184
188
|
marked_content
|
185
189
|
end
|
186
190
|
|
187
|
-
def build_doc_book(
|
188
|
-
doc_toc = gen_docbook_toc(toc)
|
191
|
+
def build_doc_book(content,options={})
|
192
|
+
doc_toc = gen_docbook_toc(toc) if toc
|
189
193
|
|
190
|
-
|
191
|
-
|
192
|
-
|
194
|
+
doc_content = if content.is_a?(Array)
|
195
|
+
struct = content.map{|item| item if item.is_a?(Hash)}.compact
|
196
|
+
gen_docbook_content_with_struct(struct)
|
197
|
+
else
|
198
|
+
gen_docbook_content(content)
|
199
|
+
end
|
193
200
|
|
194
201
|
<<-EOS
|
195
202
|
<?xml version="1.0" encoding="utf-8"?>
|
@@ -208,6 +215,8 @@ class TxtBook
|
|
208
215
|
EOS
|
209
216
|
end
|
210
217
|
|
218
|
+
|
219
|
+
|
211
220
|
def build_struct(content)
|
212
221
|
stack = Array.new(8)
|
213
222
|
struct = []
|
@@ -314,18 +323,20 @@ EOS
|
|
314
323
|
end
|
315
324
|
|
316
325
|
def extract_toc_from_struct(struct)
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
326
|
+
if struct
|
327
|
+
toc = []
|
328
|
+
struct.each do |item|
|
329
|
+
if item.is_a?(Hash)
|
330
|
+
children = []
|
331
|
+
if item[:children].any?
|
332
|
+
children = extract_toc_from_struct(item[:children])
|
333
|
+
end
|
334
|
+
item_hash = {:title=>item[:title], :type=> item[:type],:children=>children}
|
335
|
+
toc << item_hash
|
323
336
|
end
|
324
|
-
item_hash = {:title=>item[:title], :type=> item[:type],:children=>children}
|
325
|
-
toc << item_hash
|
326
337
|
end
|
338
|
+
toc
|
327
339
|
end
|
328
|
-
toc
|
329
340
|
end
|
330
341
|
|
331
342
|
def gen_docbook_toc(toc)
|
@@ -350,7 +361,16 @@ EOS
|
|
350
361
|
end
|
351
362
|
end
|
352
363
|
|
353
|
-
def gen_docbook_content(
|
364
|
+
def gen_docbook_content(content)
|
365
|
+
paras = extract_paras(content)
|
366
|
+
paras_content = paras.map do |para|
|
367
|
+
text = Utils.escape_html(Utils.clean_text(para))
|
368
|
+
"<para id='#{UUID.generate}'>#{text}</para>"
|
369
|
+
end.join("\n")
|
370
|
+
"<sect1>#{paras_content}</sect1>"
|
371
|
+
end
|
372
|
+
|
373
|
+
def gen_docbook_content_with_struct(struct)
|
354
374
|
content = []
|
355
375
|
struct.each do |item|
|
356
376
|
if item.is_a?(Hash)
|
data/lib/utils.rb
CHANGED
@@ -256,4 +256,19 @@ module Utils
|
|
256
256
|
sections
|
257
257
|
end
|
258
258
|
|
259
|
+
def extract_text_from_file(filename,format)
|
260
|
+
txt_file = File.basename(filename,format)
|
261
|
+
if !filename.include?("'")
|
262
|
+
cmd = %Q(ebook-convert '#{filename}' '#{txt_file}.txt')
|
263
|
+
elsif !filename.include?('"')
|
264
|
+
cmd = %Q(ebook-convert "#{filename}" "#{txt_file}.txt")
|
265
|
+
else
|
266
|
+
cmd = %Q(ebook-convert #{filename} #{txt_file}.txt)
|
267
|
+
end
|
268
|
+
output = `#{cmd}`
|
269
|
+
content = File.open("#{txt_file}.txt").read
|
270
|
+
FileUtils.remove_file("#{txt_file}.txt",true)
|
271
|
+
return content
|
272
|
+
end
|
273
|
+
|
259
274
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ebook_tools
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.1.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-04
|
12
|
+
date: 2013-06-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: uuid
|
@@ -156,7 +156,7 @@ dependencies:
|
|
156
156
|
- !ruby/object:Gem::Version
|
157
157
|
version: '0'
|
158
158
|
description: 电子书工具集.
|
159
|
-
email:
|
159
|
+
email: yalong1976@gmail.com
|
160
160
|
executables:
|
161
161
|
- ebook_tools
|
162
162
|
- para_import_mongo
|
@@ -171,15 +171,16 @@ files:
|
|
171
171
|
- bin/doc_book_import_mongo
|
172
172
|
- bin/para_import_scheduling
|
173
173
|
- bin/xml2json
|
174
|
+
- lib/doc_book_in_mongo.rb
|
174
175
|
- lib/ebook_tools.rb
|
175
|
-
- lib/txt_book.rb
|
176
|
+
- lib/epub.rb
|
177
|
+
- lib/epub_book.rb
|
176
178
|
- lib/header_detect.rb
|
179
|
+
- lib/paras_in_mongo.rb
|
177
180
|
- lib/pdf.rb
|
178
181
|
- lib/txt.rb
|
179
|
-
- lib/epub.rb
|
182
|
+
- lib/txt_book.rb
|
180
183
|
- lib/utils.rb
|
181
|
-
- lib/paras_in_mongo.rb
|
182
|
-
- lib/doc_book_in_mongo.rb
|
183
184
|
- workers/para_import_worker.rb
|
184
185
|
- ebook_tools.gemspec
|
185
186
|
homepage:
|