ebook_tools 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +2 -0
- data/README +76 -0
- data/bin/ebook_tools +196 -0
- data/ebook_tools.gemspec +38 -0
- data/lib/ebook_tools.rb +248 -0
- data/lib/epub.rb +104 -0
- data/lib/extract_book_struct.rb +415 -0
- data/lib/header_detect.rb +161 -0
- data/lib/pdf.rb +265 -0
- data/lib/txt.rb +108 -0
- data/lib/utils.rb +224 -0
- metadata +170 -0
data/lib/epub.rb
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'gepub'
|
3
|
+
require 'uuid'
|
4
|
+
|
5
|
+
module EPUB
|
6
|
+
extend self
|
7
|
+
include Utils
|
8
|
+
|
9
|
+
# write_epub
# Packs a set of files into an EPUB (package version 3.0) using GEPUB.
#
# parameters:
# +epub_file+ path of the EPUB file to generate
# +options+ optional settings
#   :files  collection of files to pack into the EPUB
#   :title  EPUB title
#   :author EPUB author
#
# The file whose basename (without '.html') is 'nav' is registered as the
# navigation document; .html/.htm files are added in reading order, every
# other file is added as a plain resource.
# Returns whatever GEPUB::Book#generate_epub returns.
def write_epub(epub_file,options={})
  # Work on a copy so the caller's :files array is never modified.
  files = (options[:files] || []).dup
  nav,files = extract_nav_from_files(files)

  book = GEPUB::Book.new
  book.set_main_id UUID.generate, {}
  book.add_title options[:title]
  book.version = '3.0'
  # NOTE(review): reaches into GEPUB's internal @package; presumably turns off
  # EPUB2 backward-compatibility output -- confirm against the GEPUB version in use.
  book.instance_variable_get('@package').epub_backward_compat = false
  book.add_creator options[:author]
  book.publisher='www.nonobo.com'

  # Without this guard a missing nav file crashed with
  # "TypeError: no implicit conversion of nil into String" in File.basename.
  if nav
    book.add_item(File.basename(nav),nav,'nav').add_property('nav')
  end

  files.each do |file|
    if ['.html', '.htm'].include?(File.extname(file))
      # Content documents are added inside `ordered` so GEPUB records their order.
      book.ordered{
        book.add_item(File.basename(file),file)
      }
    else
      book.add_item(File.basename(file),file)
    end
  end

  Utils.make_destination_dir(epub_file)
  book.generate_epub(epub_file)
end
|
40
|
+
|
41
|
+
# Generates the EPUB3 navigation file.
#
# +html_file+ content document the navigation entries link to
# +sections+  section hashes passed on to gen_nav_file_content
# +options+   :dir   directory to write nav.html into
#                    (defaults to the directory of +html_file+)
#             other keys (e.g. :title) are forwarded to gen_nav_file_content
#
# Writes the file through Utils.write_file and returns the path of nav.html.
def gen_nav_file(html_file,sections,options={})
  temp_dir = options[:dir] || File.dirname(html_file)
  nav_html = File.join(temp_dir,'nav.html')
  # Forward the caller's options; the previous `options={}` in the argument
  # list reset them to an empty hash, so a custom :title was never applied.
  html_content = gen_nav_file_content(html_file,sections,options)
  Utils.write_file(html_content,nav_html)
  nav_html
end
|
49
|
+
|
50
|
+
# Builds the XHTML source of the navigation document as a string.
#
# +html_file+ content document the entries link to (passed to gen_nav_items)
# +sections+  section hashes rendered as the list entries
# +options+   :title heading of the table of contents
#                    (defaults to 'Table Of Contents')
def gen_nav_file_content(html_file,sections,options={})
  heading = options.fetch(:title, 'Table Of Contents')
  nav_items = gen_nav_items("",html_file,sections)

  <<-EOS
<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head>
<META http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
</head>
<body>
<nav epub:type="toc" id="toc">
<h1>#{heading}</h1>
<ol>
#{nav_items}
</ol>
</nav>
</body>
</html>
  EOS
end
|
71
|
+
|
72
|
+
private
|
73
|
+
# Splits +files+ into the navigation document and everything else.
#
# A file counts as the navigation document when its basename, with a
# trailing '.html' removed, is 'nav'. When several match, the last one wins.
#
# Returns [nav, remaining_files]; nav is nil when no file matches.
# The array passed in is left untouched (it used to be emptied of nav
# files via delete_if, which silently changed the caller's data).
def extract_nav_from_files(files=[])
  nav_files, other_files = files.partition { |file| File.basename(file,'.html') == 'nav' }
  [nav_files.last, other_files]
end
|
81
|
+
|
82
|
+
# Renders the top-level <li> entries of the navigation list.
#
# +items+    unused; kept so existing callers keep working
# +filename+ content document; only its basename is used in the hrefs
# +sections+ array of hashes with :title, :page_num and optional :sub_sections
#
# Returns the concatenated <li> markup as a string.
def gen_nav_items(items,filename,sections)
  gen_sub_nav_items("",filename,sections)
end

# Renders +sections+ (and, recursively, their :sub_sections) as <li> entries
# and appends them to +items+.
#
# Each entry links to "<basename of filename>#<page_num>".
# Nested lists always start from an empty string: passing the accumulated
# +items+ down (as before) copied every earlier sibling into the nested <ol>.
# A nil or empty :sub_sections produces no nested <ol> at all, because an
# <ol> without <li> children is not a valid navigation list.
def gen_sub_nav_items(items,filename,sections)
  target = File.basename(filename)
  entries = sections.map do |section|
    nested = section[:sub_sections]
    sub_list = ""
    if nested && !nested.empty?
      sub_list = %Q(<ol>#{gen_sub_nav_items("",filename,nested)}</ol>)
    end
    %Q(<li><a href="#{target}\##{section[:page_num]}">#{section[:title]}</a>#{sub_list}</li>)
  end
  # With no sections, +items+ is returned unchanged.
  entries.inject(items) { |markup, entry| markup + entry }
end
|
104
|
+
end
|
@@ -0,0 +1,415 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
# =ExtractBookStruct
|
3
|
+
# ExtractBookStruct的目的是提取书的结构信息。
|
4
|
+
# ExtractBookStruct选择从TXT文档提取书的结构信息。对TXT的文档要有如下要求:
|
5
|
+
# 1. 文档的编码格式必须是UTF-8或GB2312,推荐使用UTF-8格式
|
6
|
+
# 2. 文档的内容只包含书内容部分(书名、作者、目录等信息应该不包含在文档内)
|
7
|
+
# 3. 文档的段落应该完整(有些PDF转换过来的文档会破坏句子,需要进行预处理)
|
8
|
+
# 4. 文档必须符合正常的文档流(错位的章节段落等情况将影响正常的结构提取)
|
9
|
+
# 5. 文档需要包含结构信息(例如: 卷、篇、部分、章(回)节或者有连续的序号)
|
10
|
+
# 6. 每个结构信息都应该独立成行。
|
11
|
+
#
|
12
|
+
#
|
13
|
+
# ==接口
|
14
|
+
#
|
15
|
+
# === ExtractBookStruct.from_txt
|
16
|
+
# 从文本文件中提取目录结构
|
17
|
+
#
|
18
|
+
# === ExtractBookStruct.from_epub
|
19
|
+
# 从EPUB文件中提取目录结构
|
20
|
+
#
|
21
|
+
# === ExtractBookStruct.from_html
|
22
|
+
# 从HTML中提取目录结构
|
23
|
+
|
24
|
+
require 'uuid'
|
25
|
+
require 'cgi'
|
26
|
+
require 'iconv'
|
27
|
+
|
28
|
+
module ExtractBookStruct
|
29
|
+
extend self
|
30
|
+
extend HeaderDetect
|
31
|
+
|
32
|
+
# Extracts the book structure from a plain-text file.
#
# +filename+ path of the text file
# +options+  passed through to extract_book_struct
#
# Returns the result of extract_book_struct.
def from_txt(filename,options={})
  # File.read closes the file; File.open(...).read left the handle open.
  content = File.read(filename)
  extract_book_struct(content,options)
end
|
36
|
+
|
37
|
+
# Extracts the book structure from an HTML file: the file is first turned
# into text by extract_text_from_file, then handed to extract_book_struct
# together with +options+.
def from_html(filename,options={})
  extract_book_struct(extract_text_from_file(filename,'.html'),options)
end
|
41
|
+
|
42
|
+
# Extracts the book structure from an EPUB file: the file is first turned
# into text by extract_text_from_file, then handed to extract_book_struct
# together with +options+.
def from_epub(filename,options={})
  extract_book_struct(extract_text_from_file(filename,'.epub'),options)
end
|
46
|
+
|
47
|
+
# Turns raw text into a book structure document.
#
# +content+ text of the book; converted with Utils.to_utf8 when
#           Utils.detect_utf8 reports it is not UTF-8
# +options+ :format forces the structure type (:text, :digital or :hybrid);
#           when absent the type comes from detect_struct_type.
#           The whole hash is forwarded to the type-specific extractor.
#
# Returns the extractor's result, or nil for any other structure type.
def extract_book_struct(content,options={})
  content = Utils.to_utf8(content) unless Utils.detect_utf8(content)
  paras = extract_paras(sanitize_for_epub_text(content))

  # Determine the book type (text, digital, hybrid).
  case options[:format] || detect_struct_type(paras)
  when :text    then extract_text_book_struct(paras,options)
  when :digital then extract_digital_book_struct(paras,options)
  when :hybrid  then extract_hybrid_book_struct(paras,options)
  end
end
|
67
|
+
|
68
|
+
# Converts +filename+ to plain text with the external `ebook-convert`
# command and returns the text.
#
# +filename+ path of the source document
# +format+   extension stripped from the basename to name the temporary
#            text file (e.g. '.html', '.epub')
#
# The temporary "<basename>.txt" is created in the current working directory
# and removed again, even when reading it fails.
# Raises Errno::ENOENT when `ebook-convert` is not installed or produced no
# output file.
def extract_text_from_file(filename,format)
  txt_path = "#{File.basename(filename,format)}.txt"
  begin
    # Argument-array form: no shell is involved, so paths containing spaces
    # or shell metacharacters are passed to the converter verbatim.
    # The converter's stdout is read and discarded, as with backticks before.
    IO.popen(['ebook-convert', filename, txt_path]) { |io| io.read }
    File.read(txt_path)
  ensure
    File.delete(txt_path) if File.exist?(txt_path)
  end
end
|
76
|
+
|
77
|
+
# Splits +content+ into paragraphs: every line is passed through
# Utils.clean_text and lines that come back empty are dropped.
# Returns an empty array when +content+ is blank.
def extract_paras(content)
  return [] if content.blank?

  cleaned = content.each_line.map { |line| Utils.clean_text(line) }
  cleaned.reject { |text| text.length == 0 }
end
|
86
|
+
|
87
|
+
# Classifies the heading style used in +paras+.
#
# Returns :hybrid when both textual headers (volume/part/chapter/section
# guesses) and digital headers occur, :text or :digital when only one kind
# occurs, and :unknown when neither does.
# The guess_* predicates are not defined in this file.
def detect_struct_type(paras)
  has_text = false
  has_digital = false

  paras.each do |para|
    has_text = true if guess_volume?(para) || guess_part?(para) || guess_chapter?(para) || guess_section?(para)
    has_digital = true if guess_digital_header?(para)
  end

  return :hybrid if has_text && has_digital
  return :text if has_text
  return :digital if has_digital
  :unknown
end
|
110
|
+
|
111
|
+
# Extracts the structure of a text-type book.
# Pipeline: mark structure info -> build the book structure ->
# revise the structure -> generate the docbook.
def extract_text_book_struct(content,options={})
  marked = mark_struct_info(content)
  revised = revise_struct(build_struct(marked))
  build_doc_book(revised,options)
end
|
125
|
+
|
126
|
+
# Extracts the structure of a digital-type (numbered headings) book.
# Pipeline: mark digital structure info -> build the book structure ->
# revise the structure -> generate the docbook.
def extract_digital_book_struct(content,options={})
  marked = mark_digital_struct_info(content)
  revised = revise_struct(build_struct(marked))
  build_doc_book(revised,options)
end
|
139
|
+
|
140
|
+
# Extracts the structure of a hybrid-type book (textual and digital headings).
# Pipeline: mark hybrid structure info -> build the book structure ->
# revise the structure -> generate the docbook.
def extract_hybrid_book_struct(content,options={})
  marked = mark_hybrid_struct_info(content)
  revised = revise_struct(build_struct(marked))
  build_doc_book(revised,options)
end
|
153
|
+
|
154
|
+
# Marks structure information.
# Walks the lines of +content+ in order and guesses, via guess_header?,
# whether each one is a structural heading. Headings are stored as
# {:title, :type} hashes, all other non-empty lines are kept as they are;
# empty lines are skipped. Returns the resulting array.
def mark_struct_info(content)
  content.each_with_object([]) do |text, marked|
    next if text.length == 0

    type = guess_header?(text)
    marked << (type ? {:title=>text,:type=>type} : text)
  end
end
|
170
|
+
|
171
|
+
# Marks structure information for hybrid books.
# Each non-empty line is checked with guess_header? first and, when that
# finds nothing, with guess_digital_section?. A hit is stored as a
# {:title, :type} hash; anything else is kept as plain text.
def mark_hybrid_struct_info(content)
  content.each_with_object([]) do |text, marked|
    next if text.length == 0

    type = guess_header?(text) || guess_digital_section?(text)
    marked << (type ? {:title=>text,:type=>type} : text)
  end
end
|
190
|
+
|
191
|
+
# Marks structure information for digital books.
# Each non-empty line is checked with guess_header? first and, when that
# finds nothing, with guess_digital_header?. A hit is stored as a
# {:title, :type} hash; anything else is kept as plain text.
def mark_digital_struct_info(content)
  content.each_with_object([]) do |text, marked|
    next if text.length == 0

    type = guess_header?(text) || guess_digital_header?(text)
    marked << (type ? {:title=>text,:type=>type} : text)
  end
end
|
210
|
+
|
211
|
+
# Revises the structure.
# TODO: not implemented yet -- the structure is handed back unchanged.
def revise_struct(struct)
  struct
end
|
215
|
+
|
216
|
+
# Renders the book structure as a DocBook 5.0 document string.
#
# +struct+  array produced by build_struct; top-level entries that are not
#           hashes (loose text before the first heading) are left out of
#           the content
# +options+ :title, :author, :pubdate, :publisher -- written into <info>
#
# The metadata values are XML-escaped so that characters such as '&' or '<'
# no longer produce a malformed document; missing values render as empty
# elements, as before.
def build_doc_book(struct,options={})
  doc_toc = gen_docbook_toc(extract_toc_from_struct(struct))

  sections = struct.select { |item| item.is_a?(Hash) }
  doc_content = gen_docbook_content(sections)

  title, author, pubdate, publisher = [:title, :author, :pubdate, :publisher].map do |key|
    CGI.escapeHTML(options[key].to_s)
  end

  <<-EOS
<?xml version="1.0" encoding="utf-8"?>
<book xmlns="http://docbook.org/ns/docbook" version="5.0">
<info>
<title>#{title}</title>
<author>#{author}</author>
<pubdate>#{pubdate}</pubdate>
<publisher>#{publisher}</publisher>
</info>
#{doc_toc}
#{doc_content}
</book>
  EOS
end
|
239
|
+
|
240
|
+
# Builds the nested book structure from marked content.
#
# +content+ array as produced by the mark_*_struct_info methods: heading
#           hashes ({:title, :type}) interleaved with plain text lines.
#
# Headings are placed on one of eight nesting levels:
#   0 volume, 1 part, 2 chapter/appendix/index/glossary/preface/afterword,
#   3..7 sect1..sect5.
# A new heading first closes every open node on its own level and deeper
# (attaching each to its nearest open ancestor), then becomes the open node
# of its level. Text lines are appended to the deepest open node, or to the
# result itself when nothing is open. Headings of any other type are ignored.
# A sect1 directly under a preface/appendix/index/glossary is not opened as
# a node; only its title is added to that parent's children.
#
# Returns the array of top-level nodes ({:title, :type, :children}) and
# loose text lines.
def build_struct(content)
  levels = {
    :volume => 0, :part => 1,
    :chapter => 2, :appendix => 2, :index => 2,
    :glossary => 2, :preface => 2, :afterword => 2,
    :sect1 => 3, :sect2 => 4, :sect3 => 5, :sect4 => 6, :sect5 => 7
  }
  flat_parents = ['preface','appendix','index','glossary']

  stack = Array.new(8)
  struct = []

  content.each do |line|
    unless line.is_a?(Hash)
      owner = stack.compact.last
      (owner ? owner[:children] : struct) << line
      next
    end

    type = line[:type].to_sym
    level = levels[type]
    next unless level

    if type == :sect1 && stack[2] && flat_parents.include?(stack[2][:type])
      stack[2][:children] << line[:title]
      next
    end

    # Every level, sect5 included, is closed through a copy of the stack.
    # sect5 used to hand the live stack to closed_node, which popped entries
    # off it and thereby lost the open sect4 when two sect5 headings followed
    # each other.
    7.downto(level) do |index|
      closed_node(struct,stack[0..index])
      stack[index] = nil
    end
    stack[level] = {:title=>line[:title],:type=>type.to_s,:children=>[]}
  end

  7.downto(0) do |index|
    closed_node(struct,stack[0..index])
    stack[index] = nil
  end

  struct
end

# Closes the node in the last slot of +stack+: it is appended to the
# children of the nearest preceding non-nil entry, or to +struct+ when there
# is none. Does nothing when the last slot is empty.
# +stack+ is only read, never modified.
def closed_node(struct,stack)
  node = stack.last
  return unless node

  parent = stack[0...-1].compact.last
  (parent ? parent[:children] : struct) << node
end
|
344
|
+
|
345
|
+
# Derives the table of contents from a book structure: keeps only the
# heading nodes (hashes), recursively, and drops all text lines.
# Returns an array of {:title, :type, :children} hashes.
def extract_toc_from_struct(struct)
  headings = struct.select { |item| item.is_a?(Hash) }
  headings.map do |node|
    {:title=>node[:title], :type=> node[:type],:children=>extract_toc_from_struct(node[:children])}
  end
end
|
359
|
+
|
360
|
+
# Wraps the <tocdiv> entries rendered by gen_docbook_tocdiv in a DocBook
# <toc> element titled "Table of contents".
def gen_docbook_toc(toc)
  entries = gen_docbook_tocdiv(toc)
  %Q(<toc><title>Table of contents</title>#{entries}</toc>)
end
|
363
|
+
|
364
|
+
# Renders table-of-contents entries as nested DocBook <tocdiv> elements.
#
# +toc+ array of {:title, :children} hashes (see extract_toc_from_struct)
#
# Titles are XML-escaped; a raw '&' or '<' in a heading previously made the
# generated document malformed. Returns the markup as one string.
def gen_docbook_tocdiv(toc)
  toc.map do |item|
    title = CGI.escapeHTML(item[:title].to_s)
    "<tocdiv><title>#{title}</title>#{gen_docbook_tocdiv(item[:children])}</tocdiv>"
  end.join("")
end
|
375
|
+
|
376
|
+
# Renders a book structure as DocBook body markup.
#
# +struct+ array of heading nodes ({:title, :type, :children}) and text lines
#
# * 'volume' and 'part' nodes become <part> elements,
# * chapter/appendix/glossary/index/preface and sect1..sect5 nodes become an
#   element named after their type,
# * text lines are cleaned and escaped through Utils and, when non-empty,
#   become <para> elements.
# Every element gets a freshly generated UUID. Heading titles are now
# XML-escaped as well, consistent with the paragraph text.
#
# NOTE(review): nodes of any other type -- including 'afterword', which
# build_struct does create -- match no branch and are dropped together with
# their children. Confirm whether that is intended.
#
# Returns the elements joined by newlines.
def gen_docbook_content(struct)
  content = []
  struct.each do |item|
    unless item.is_a?(Hash)
      text = Utils.escape_html(Utils.clean_text(item))
      content << "<para id='#{UUID.generate}'>#{text}</para>" if text.length > 0
      next
    end

    children = item[:children].any? ? gen_docbook_content(item[:children]) : ""
    title = CGI.escapeHTML(item[:title].to_s)

    case item[:type]
    when 'volume','part'
      content << "<part label='#{UUID.generate}'>\n<info><title>#{title}</title>\n</info>#{children}\n</part>"
    when 'chapter','appendix','glossary','index','preface',
         'sect1','sect2','sect3','sect4','sect5'
      content << "<#{item[:type]} id='#{UUID.generate}'>\n<info>\n<title>#{title}</title>\n</info>#{children}\n</#{item[:type]}>"
    end
  end
  content.join("\n")
end
|
401
|
+
|
402
|
+
# sanitize_for_epub_text
# Keeps the lines of +content+ up to, but not including, the first line that
# contains 'document outline' (case-insensitive); that line and everything
# after it are discarded. Blank content is returned as it is.
def sanitize_for_epub_text(content)
  return content if content.blank?

  kept = content.each_line.take_while { |line| !line.downcase.include?('document outline') }
  kept.join("")
end
|
415
|
+
end
|