ebook_tools 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +2 -0
- data/README +76 -0
- data/bin/ebook_tools +196 -0
- data/ebook_tools.gemspec +38 -0
- data/lib/ebook_tools.rb +248 -0
- data/lib/epub.rb +104 -0
- data/lib/extract_book_struct.rb +415 -0
- data/lib/header_detect.rb +161 -0
- data/lib/pdf.rb +265 -0
- data/lib/txt.rb +108 -0
- data/lib/utils.rb +224 -0
- metadata +170 -0
data/lib/epub.rb
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'gepub'
|
3
|
+
require 'uuid'
|
4
|
+
|
5
|
+
module EPUB
|
6
|
+
extend self
|
7
|
+
include Utils
|
8
|
+
|
9
|
+
# write_epub
# Packages a set of files into an EPUB archive.
#
# parameters:
# +epub_file+ path of the EPUB file to generate
# +options+ optional settings
#   :files  collection of files to package into the EPUB
#   :title  EPUB title
#   :author EPUB author
#
# A file whose basename is "nav.html" is registered as the navigation
# document; every other file is added as a regular item, with .html/.htm
# files added inside book.ordered.
def write_epub(epub_file,options={})
  # Work on a copy so the caller's :files array is never modified while the
  # navigation file is separated from the rest.
  files = (options[:files] || []).dup
  nav,files = extract_nav_from_files(files)
  book = GEPUB::Book.new
  book.set_main_id UUID.generate, {}
  book.add_title options[:title]
  book.version = '3.0'
  book.instance_variable_get('@package').epub_backward_compat = false
  book.add_creator options[:author]
  book.publisher='www.nonobo.com'
  # nav is nil when no nav.html was supplied; File.basename(nil) would raise
  # a TypeError, so the navigation item is only registered when present.
  book.add_item(File.basename(nav),nav,'nav').add_property('nav') if nav
  files.each do |file|
    if ['.html','.htm'].include?(File.extname(file))
      book.ordered{
        book.add_item(File.basename(file),file)
      }
    else
      book.add_item(File.basename(file),file)
    end
  end

  Utils.make_destination_dir(epub_file)
  book.generate_epub(epub_file)
end
|
40
|
+
|
41
|
+
# Generates the EPUB3 navigation file.
#
# Writes "nav.html" into options[:dir] (or, when that is not given, into the
# directory of +html_file+) and returns the path of the written file.
# +sections+ and +options+ are forwarded to gen_nav_file_content.
def gen_nav_file(html_file,sections,options={})
  temp_dir = options[:dir] || File.dirname(html_file)
  nav_html = File.join(temp_dir,'nav.html')
  # Forward the caller's options unchanged. Writing `options={}` in the
  # argument list would reassign the local to an empty hash and drop :title.
  html_content = gen_nav_file_content(html_file,sections,options)
  Utils.write_file(html_content,nav_html)
  nav_html
end
|
49
|
+
|
50
|
+
# Builds the XHTML source of the EPUB3 navigation document.
#
# +html_file+ and +sections+ are handed to gen_nav_items to render the list
# entries. +options+ may carry :title, used as the heading text; it defaults
# to 'Table Of Contents'. Returns the document as a String.
def gen_nav_file_content(html_file,sections,options={})
  opts = {:title => 'Table Of Contents'}.merge(options)
  # The document is XHTML, where element names are case-sensitive, so the
  # meta element is written in lower case.
  <<-EOS
<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
</head>
<body>
<nav epub:type="toc" id="toc">
<h1>#{opts[:title]}</h1>
<ol>
#{gen_nav_items("",html_file,sections)}
</ol>
</nav>
</body>
</html>
  EOS
end
|
71
|
+
|
72
|
+
private
|
73
|
+
# Splits +files+ into the navigation file and everything else.
#
# A file counts as the navigation file when its basename, with a ".html"
# suffix removed, is "nav". Returns [nav, others]: nav is the last matching
# path (nil when there is none) and others is a new array holding the
# remaining paths in their original order. The input array is not modified.
def extract_nav_from_files(files=[])
  nav_files, others = files.partition { |file| File.basename(file,'.html') == 'nav' }
  [nav_files.last, others]
end
|
81
|
+
|
82
|
+
# Renders the top-level <li> entries of the navigation list.
#
# Each section hash supplies :page_num (used as the link fragment), :title
# (the link text) and optionally :sub_sections, which are rendered as a
# nested <ol> through gen_sub_nav_items. Links point at the basename of
# +filename+. The +items+ argument is accepted but not used.
def gen_nav_items(items,filename,sections)
  entries = sections.map do |section|
    nested = ""
    if section[:sub_sections]
      nested = %Q(<ol>#{gen_sub_nav_items("",filename,section[:sub_sections])}</ol>)
    end
    %Q(<li><a href="#{File.basename(filename)}\##{section[:page_num]}">#{section[:title]}</a>#{nested}</li>)
  end
  entries.join("")
end
|
93
|
+
|
94
|
+
# Renders nested <li> entries for +sections+ and appends them to +items+.
#
# Each section hash supplies :page_num (link fragment), :title (link text)
# and optionally :sub_sections, rendered recursively as a nested <ol>.
# Links point at the basename of +filename+. Returns +items+ followed by the
# rendered entries.
def gen_sub_nav_items(items,filename,sections)
  rendered = sections.map do |section|
    sub_items = ""
    if section[:sub_sections]
      # Start each nested list from an empty string. Passing the accumulated
      # +items+ here would copy earlier siblings into the nested <ol>.
      sub_items = %Q(<ol>#{gen_sub_nav_items("",filename,section[:sub_sections])}</ol>)
    end
    %Q(<li><a href="#{File.basename(filename)}\##{section[:page_num]}">#{section[:title]}</a>#{sub_items}</li>)
  end
  items + rendered.join("")
end
|
104
|
+
end
|
@@ -0,0 +1,415 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
# =ExtractBookStruct
|
3
|
+
# ExtractBookStruct的目的是提取书的结构信息。
|
4
|
+
# ExtractBookStruct选择从TXT文档提取书的结构信息。对TXT的文档要有如下要求:
|
5
|
+
# 1. 文档的编码格式必须是UTF-8或GB2312,推荐使用UTF-8格式
|
6
|
+
# 2. 文档的内容只包含书内容部分(书名、作者、目录等信息应该不包含在文档内)
|
7
|
+
# 3. 文档的段落应该完整(有些PDF转换过来的文档会破坏句子,需要进行预处理)
|
8
|
+
# 4. 文档必须符合正常的文档流(错位的章节段落等情况将影响正常的结构提取)
|
9
|
+
# 5. 文档需要包含结构信息(例如: 卷、篇、部分、章(回)节或者有连续的序号)
|
10
|
+
# 6. 每个结构信息都应该独立成行。
|
11
|
+
#
|
12
|
+
#
|
13
|
+
# ==接口
|
14
|
+
#
|
15
|
+
# === ExtractBookStruct.from_txt
|
16
|
+
# 从文本文件中提取目录结构
|
17
|
+
#
|
18
|
+
# === ExtractBookStruct.from_epub
|
19
|
+
# 从EPUB文件中提取目录结构
|
20
|
+
#
|
21
|
+
# === ExtractBookStruct.from_html
|
22
|
+
# 从HTML中提取目录结构
|
23
|
+
|
24
|
+
require 'uuid'
|
25
|
+
require 'cgi'
|
26
|
+
require 'iconv'
|
27
|
+
|
28
|
+
module ExtractBookStruct
|
29
|
+
extend self
|
30
|
+
extend HeaderDetect
|
31
|
+
|
32
|
+
# Extracts the book structure from a plain-text file.
#
# Reads the whole of +filename+ and passes its content, together with
# +options+, to extract_book_struct, whose result is returned.
def from_txt(filename,options={})
  # File.read closes the file after reading; File.open(...).read left the
  # handle open until garbage collection.
  extract_book_struct(File.read(filename),options)
end
|
36
|
+
|
37
|
+
# Extracts the book structure from an HTML file.
#
# The file is converted to text by extract_text_from_file (using '.html' as
# the suffix to strip) and the text is handed to extract_book_struct along
# with +options+.
def from_html(filename,options={})
  extract_book_struct(extract_text_from_file(filename,'.html'),options)
end
|
41
|
+
|
42
|
+
# Extracts the book structure from an EPUB file.
#
# The file is converted to text by extract_text_from_file (using '.epub' as
# the suffix to strip) and the text is handed to extract_book_struct along
# with +options+.
def from_epub(filename,options={})
  extract_book_struct(extract_text_from_file(filename,'.epub'),options)
end
|
46
|
+
|
47
|
+
# Extracts the book structure from text +content+.
#
# The content is converted with Utils.to_utf8 unless Utils.detect_utf8
# accepts it, trimmed by sanitize_for_epub_text and split into paragraphs by
# extract_paras. The book type is options[:format] when given, otherwise the
# result of detect_struct_type; :text, :digital and :hybrid are dispatched
# to the matching extractor. Any other type yields nil.
def extract_book_struct(content,options={})
  content = Utils.to_utf8(content) unless Utils.detect_utf8(content)
  paras = extract_paras(sanitize_for_epub_text(content))

  # Determine the book type (text, digital, hybrid).
  case options[:format] || detect_struct_type(paras)
  when :text    then extract_text_book_struct(paras,options)
  when :digital then extract_digital_book_struct(paras,options)
  when :hybrid  then extract_hybrid_book_struct(paras,options)
  end
end
|
67
|
+
|
68
|
+
# Converts +filename+ to plain text with the external `ebook-convert`
# program and returns the text.
#
# +format+ is the suffix stripped from the basename of +filename+ to name
# the intermediate "<basename>.txt" file, which is created relative to the
# current working directory and removed afterwards.
def extract_text_from_file(filename,format)
  txt_path = "#{File.basename(filename,format)}.txt"
  # The array form runs the program directly, without a shell, so file names
  # containing spaces or shell metacharacters are passed through verbatim.
  # Its standard output is read and discarded.
  IO.popen(['ebook-convert',filename,txt_path]) { |io| io.read }
  File.read(txt_path)
ensure
  # Remove the intermediate file even when the conversion or read failed;
  # the force flag keeps a missing file from raising here.
  FileUtils.remove_file(txt_path,true) if txt_path
end
|
76
|
+
|
77
|
+
# Splits +content+ into an array of cleaned, non-empty paragraphs.
#
# Every line is passed through Utils.clean_text; lines whose cleaned text
# has zero length are dropped. Blank content yields an empty array.
def extract_paras(content)
  return [] if content.blank?
  cleaned = content.each_line.map { |line| Utils.clean_text(line) }
  cleaned.reject { |text| text.length == 0 }
end
|
86
|
+
|
87
|
+
# Classifies the heading style used by +paras+.
#
# Returns :hybrid when both a textual heading (volume, part, chapter or
# section, per the guess_* predicates) and a digital heading (per
# guess_digital_header?) occur, :text or :digital when only one kind
# occurs, and :unknown when neither does.
def detect_struct_type(paras)
  has_text = paras.any? do |para|
    guess_volume?(para) || guess_part?(para) || guess_chapter?(para) || guess_section?(para)
  end
  has_digital = paras.any? { |para| guess_digital_header?(para) }

  return :hybrid  if has_text && has_digital
  return :text    if has_text
  return :digital if has_digital
  :unknown
end
|
110
|
+
|
111
|
+
# Extracts the structure of a text-type book.
#
# Pipeline: annotate the paragraphs with mark_struct_info, build the tree
# with build_struct, pass it through revise_struct and render it with
# build_doc_book using +options+.
def extract_text_book_struct(content,options={})
  marked = mark_struct_info(content)
  tree = revise_struct(build_struct(marked))
  build_doc_book(tree,options)
end
|
125
|
+
|
126
|
+
# Extracts the structure of a digital-type book.
#
# Pipeline: annotate the paragraphs with mark_digital_struct_info, build the
# tree with build_struct, pass it through revise_struct and render it with
# build_doc_book using +options+.
def extract_digital_book_struct(content,options={})
  marked = mark_digital_struct_info(content)
  tree = revise_struct(build_struct(marked))
  build_doc_book(tree,options)
end
|
139
|
+
|
140
|
+
# Extracts the structure of a hybrid-type book.
#
# Pipeline: annotate the paragraphs with mark_hybrid_struct_info, build the
# tree with build_struct, pass it through revise_struct and render it with
# build_doc_book using +options+.
def extract_hybrid_book_struct(content,options={})
  marked = mark_hybrid_struct_info(content)
  tree = revise_struct(build_struct(marked))
  build_doc_book(tree,options)
end
|
153
|
+
|
154
|
+
# Annotates structure information.
#
# Walks the paragraphs in order and asks guess_header? whether each one is a
# heading. A recognised heading is stored as {:title => text, :type => type};
# any other non-empty paragraph is stored unchanged. Zero-length entries are
# skipped. Returns the annotated array.
def mark_struct_info(content)
  content.reject { |text| text.length == 0 }.map do |text|
    type = guess_header?(text)
    type ? {:title=>text,:type=>type} : text
  end
end
|
170
|
+
|
171
|
+
# Annotates structure information for hybrid books.
#
# Each non-empty paragraph is tested with guess_header? first and, when that
# finds nothing, with guess_digital_section?. A recognised heading is stored
# as {:title => text, :type => type}; other paragraphs are stored unchanged.
def mark_hybrid_struct_info(content)
  content.reject { |text| text.length == 0 }.map do |text|
    type = guess_header?(text) || guess_digital_section?(text)
    type ? {:title=>text,:type=>type} : text
  end
end
|
190
|
+
|
191
|
+
# Annotates structure information for digital books.
#
# Each non-empty paragraph is tested with guess_header? first and, when that
# finds nothing, with guess_digital_header?. A recognised heading is stored
# as {:title => text, :type => type}; other paragraphs are stored unchanged.
def mark_digital_struct_info(content)
  content.reject { |text| text.length == 0 }.map do |text|
    type = guess_header?(text) || guess_digital_header?(text)
    type ? {:title=>text,:type=>type} : text
  end
end
|
210
|
+
|
211
|
+
# Revises the structure.
# TODO: not implemented yet; +struct+ is currently returned unchanged.
def revise_struct(struct)
  struct
end
|
215
|
+
|
216
|
+
# Renders +struct+ as a DocBook 5.0 document and returns it as a String.
#
# +options+ supplies :title, :author, :pubdate and :publisher for the <info>
# element; missing values render as empty elements. The table of contents
# comes from extract_toc_from_struct / gen_docbook_toc and the body from
# gen_docbook_content. Only Hash entries of the top-level +struct+ are
# rendered in the body; top-level plain text is left out.
def build_doc_book(struct,options={})
  toc = extract_toc_from_struct(struct)

  doc_toc = gen_docbook_toc(toc)

  struct = struct.select { |item| item.is_a?(Hash) }

  doc_content = gen_docbook_content(struct)

  # Metadata is free text, so it is escaped before being placed in XML;
  # to_s keeps a missing value rendering as an empty element.
  title     = CGI.escapeHTML(options[:title].to_s)
  author    = CGI.escapeHTML(options[:author].to_s)
  pubdate   = CGI.escapeHTML(options[:pubdate].to_s)
  publisher = CGI.escapeHTML(options[:publisher].to_s)

  <<-EOS
<?xml version="1.0" encoding="utf-8"?>
<book xmlns="http://docbook.org/ns/docbook" version="5.0">
<info>
<title>#{title}</title>
<author>#{author}</author>
<pubdate>#{pubdate}</pubdate>
<publisher>#{publisher}</publisher>
</info>
#{doc_toc}
#{doc_content}
</book>
  EOS
end
|
239
|
+
|
240
|
+
# Builds a nested tree from the annotated paragraph list +content+.
#
# Hash entries are headings ({:title, :type}); anything else is body text.
# Headings occupy one of eight nesting levels:
#   0 volume, 1 part, 2 chapter/appendix/index/glossary/preface/afterword,
#   3 sect1, 4 sect2, 5 sect3, 6 sect4, 7 sect5.
# A new heading closes every open node at its own level or deeper (each
# closed node is attached to its nearest open ancestor, or to the result when
# it has none) and then becomes the open node of its level. Body text is
# appended to the deepest open node, or to the result when nothing is open.
# Headings of any other type are ignored.
#
# Returns the array of top-level nodes and text; each node is
# {:title, :type (String), :children}.
def build_struct(content)
  levels = {
    :volume => 0, :part => 1,
    :chapter => 2, :appendix => 2, :index => 2, :glossary => 2,
    :preface => 2, :afterword => 2,
    :sect1 => 3, :sect2 => 4, :sect3 => 5, :sect4 => 6, :sect5 => 7
  }
  # Inside these level-2 kinds a sect1 heading is kept as plain text.
  flat_kinds = ['preface','appendix','index','glossary']

  stack = Array.new(8)
  struct = []

  # Closes every open node from the deepest level down to +level+.
  # closed_node always receives a slice (a copy), so the live stack keeps
  # its eight slots. Handing it the stack itself, as the sect5 case once
  # did, let it pop the parent sect4 off the stack and lose that subtree.
  close_from = lambda do |level|
    7.downto(level) do |index|
      closed_node(struct,stack[0..index])
      stack[index] = nil
    end
  end

  content.each do |line|
    if line.is_a?(Hash)
      type = line[:type].to_sym
      level = levels[type]
      next unless level

      if type == :sect1 && stack[2] && flat_kinds.include?(stack[2][:type])
        stack[2][:children] << line[:title]
      else
        close_from.call(level)
        stack[level] = {:title=>line[:title],:type=>type.to_s,:children=>[]}
      end
    else
      owner = stack.compact.last
      (owner ? owner[:children] : struct) << line
    end
  end

  close_from.call(0)

  struct
end
|
326
|
+
|
327
|
+
# Closes the node at the end of +stack+.
#
# When the last element of +stack+ is a node, it is appended to the
# :children of the nearest preceding non-nil element; when no such element
# exists it is appended to +struct+. A nil last element (or an empty stack)
# does nothing. +stack+ itself is never modified, so a caller may pass its
# live stack as well as a slice of it.
def closed_node(struct,stack)
  node = stack.last
  return unless node

  parent = stack[0...-1].compact.last
  if parent
    parent[:children] << node
  else
    struct << node
  end
end
|
344
|
+
|
345
|
+
# Derives a table-of-contents tree from +struct+.
#
# Only Hash entries (headings) are kept; plain text is skipped. Each heading
# becomes {:title, :type, :children}, where :children is the same extraction
# applied to the heading's own children (an empty array when it has none).
def extract_toc_from_struct(struct)
  struct.select { |entry| entry.is_a?(Hash) }.map do |node|
    nested = node[:children].any? ? extract_toc_from_struct(node[:children]) : []
    {:title=>node[:title], :type=>node[:type], :children=>nested}
  end
end
|
359
|
+
|
360
|
+
# Wraps the entries rendered by gen_docbook_tocdiv in a DocBook <toc>
# element titled "Table of contents".
def gen_docbook_toc(toc)
  entries = gen_docbook_tocdiv(toc)
  "<toc><title>Table of contents</title>#{entries}</toc>"
end
|
363
|
+
|
364
|
+
# Renders +toc+ entries as nested DocBook <tocdiv> elements.
#
# Each entry is a hash with :title and :children (an array of further
# entries). Titles are XML-escaped, since they are raw heading text and may
# contain characters such as & or <. Returns the concatenated markup.
def gen_docbook_tocdiv(toc)
  toc.map do |item|
    children = item[:children].any? ? gen_docbook_tocdiv(item[:children]) : ""
    "<tocdiv><title>#{CGI.escapeHTML(item[:title].to_s)}</title>#{children}</tocdiv>"
  end.join("")
end
|
375
|
+
|
376
|
+
# Renders +struct+ as DocBook body markup.
#
# Hash entries are headings: 'volume' and 'part' become <part>; 'chapter',
# 'appendix', 'glossary', 'index', 'preface' and 'sect1'..'sect5' become an
# element named after their type. Children are rendered recursively inside.
# Other entries are text: cleaned with Utils.clean_text, escaped, and wrapped
# in <para> when non-empty. Each emitted element gets a fresh UUID.
# Returns the pieces joined with newlines.
#
# NOTE(review): headings of any other type (for example 'afterword', which
# build_struct can produce) match no branch and are omitted together with
# their children. Confirm whether that is intended.
def gen_docbook_content(struct)
  content = []
  struct.each do |item|
    if item.is_a?(Hash)
      children = item[:children].any? ? gen_docbook_content(item[:children]) : ""
      # Titles are raw heading text; escape them like paragraph text so that
      # characters such as & or < cannot break the XML.
      title = Utils.escape_html(item[:title].to_s)
      case item[:type]
      when 'volume','part'
        content << "<part label='#{UUID.generate}'>\n<info><title>#{title}</title>\n</info>#{children}\n</part>"
      when 'chapter','appendix','glossary','index','preface',
           'sect1','sect2','sect3','sect4','sect5'
        content << "<#{item[:type]} id='#{UUID.generate}'>\n<info>\n<title>#{title}</title>\n</info>#{children}\n</#{item[:type]}>"
      end
    else
      text = Utils.escape_html(Utils.clean_text(item))
      content << "<para id='#{UUID.generate}'>#{text}</para>" if text.length > 0
    end
  end
  content.join("\n")
end
|
401
|
+
|
402
|
+
# sanitize_for_epub_text
# Returns +content+ truncated at the first line that contains
# "document outline" (compared case-insensitively); that line and everything
# after it are discarded. Blank content is returned as is.
def sanitize_for_epub_text(content)
  return content if content.blank?
  kept = content.each_line.take_while do |line|
    !line.downcase.include?('document outline')
  end
  kept.join("")
end
|
415
|
+
end
|