ebook_tools 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,104 @@
1
+ # encoding: UTF-8
2
+ require 'gepub'
3
+ require 'uuid'
4
+
5
+ module EPUB
6
+ extend self
7
+ include Utils
8
+
9
# write_epub
# Packages a set of files into an EPUB 3 archive.
#
# Parameters:
# +epub_file+:: path of the EPUB file to generate
# +options+:: optional settings
#   :files  - collection of file paths to pack into the EPUB; one of them
#             must be the navigation document recognised by
#             extract_nav_from_files
#   :title  - EPUB title
#   :author - EPUB author
#
# Raises ArgumentError when no navigation document is found in :files.
def write_epub(epub_file,options={})
  nav,files = extract_nav_from_files(options[:files] || [])
  # Fail early with a clear message. Without this guard the
  # File.basename(nav) call below raised an opaque TypeError for nil.
  if nav.nil?
    raise ArgumentError, 'no navigation document (nav.html) found in options[:files]'
  end

  book = GEPUB::Book.new
  book.set_main_id UUID.generate, {}
  book.add_title options[:title]
  book.version = '3.0'
  # NOTE(review): reaches into a GEPUB instance variable; presumably there
  # was no public setter for this flag in the gem version used - confirm.
  book.instance_variable_get('@package').epub_backward_compat = false
  book.add_creator options[:author]
  book.publisher='www.nonobo.com'
  book.add_item(File.basename(nav),nav,'nav').add_property('nav')

  files.each do |file|
    if ['.html', '.htm'].include?(File.extname(file))
      # HTML documents are added inside book.ordered; every other file is
      # added as a plain item.
      book.ordered{ book.add_item(File.basename(file),file) }
    else
      book.add_item(File.basename(file),file)
    end
  end

  Utils.make_destination_dir(epub_file)
  book.generate_epub(epub_file)
end
40
+
41
# Generates the EPUB 3 navigation file (nav.html) and returns its path.
#
# +html_file+:: content document the navigation entries link to
# +sections+:: section list handed to gen_nav_file_content
# +options+:: :dir - directory to write nav.html into (defaults to the
#             directory of +html_file+); all options are also forwarded to
#             gen_nav_file_content (e.g. :title)
def gen_nav_file(html_file,sections,options={})
  target_dir = options[:dir] || File.dirname(html_file)
  nav_html = File.join(target_dir,'nav.html')
  # Forward the caller's options. The previous argument `options={}` was an
  # assignment that replaced them with an empty hash, so :title was lost.
  html_content = gen_nav_file_content(html_file,sections,options)
  Utils.write_file(html_content,nav_html)
  nav_html
end
49
+
50
# Builds the XHTML source of the EPUB 3 navigation document.
#
# +html_file+:: content document the entries link to (passed to gen_nav_items)
# +sections+:: section list rendered by gen_nav_items
# +options+:: :title - heading of the table of contents
#             (default 'Table Of Contents')
#
# Returns the document as a String. The XML declaration is always the very
# first thing in the output, and the heading is escaped with
# Utils.escape_html so characters such as "&" or "<" cannot break the markup.
def gen_nav_file_content(html_file,sections,options={})
  opts = {:title => 'Table Of Contents'}.merge(options)
  heading = Utils.escape_html(opts[:title].to_s)

  [
    '<?xml version="1.0" encoding="utf-8"?>',
    '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">',
    '<head>',
    # XHTML element names are case-sensitive, hence lower-case "meta".
    '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>',
    '</head>',
    '<body>',
    '<nav epub:type="toc" id="toc">',
    "<h1>#{heading}</h1>",
    '<ol>',
    gen_nav_items("",html_file,sections),
    '</ol>',
    '</nav>',
    '</body>',
    '</html>'
  ].join("\n") + "\n"
end
71
+
72
+ private
73
# Splits +files+ into the navigation document and everything else.
#
# A file counts as the navigation document when its basename, with a
# ".html" suffix removed, is "nav". If several files match, the last one is
# returned as nav and all of them are left out of the remaining list.
#
# Returns [nav, remaining_files]; nav is nil when nothing matches. The
# argument is not modified (earlier code deleted entries from the caller's
# array in place).
def extract_nav_from_files(files=[])
  nav_files, other_files = files.partition do |file|
    File.basename(file,'.html') == 'nav'
  end
  [nav_files.last, other_files]
end
81
+
82
# Renders the top-level <li> entries of the navigation list.
#
# +items+:: accepted for signature compatibility; not used
# +filename+:: content document; only its basename is used in the links
# +sections+:: array of hashes with :page_num, :title and an optional
#              :sub_sections array (rendered through gen_sub_nav_items)
#
# Returns the concatenated <li> markup as a String.
# NOTE(review): titles are interpolated as-is; confirm whether callers
# already escape them.
def gen_nav_items(items,filename,sections)
  entries = sections.map do |section|
    nested = if section[:sub_sections]
      %Q(<ol>#{gen_sub_nav_items("",filename,section[:sub_sections])}</ol>)
    else
      ""
    end
    %Q(<li><a href="#{File.basename(filename)}\##{section[:page_num]}">#{section[:title]}</a>#{nested}</li>)
  end
  entries.join("")
end
93
+
94
# Renders nested navigation entries recursively.
#
# +items+:: markup to prepend to the generated entries (callers pass "")
# +filename+:: content document; only its basename is used in the links
# +sections+:: array of hashes with :page_num, :title and an optional
#              :sub_sections array
#
# Returns +items+ followed by one <li> per section.
def gen_sub_nav_items(items,filename,sections)
  sections.inject(items) do |html, section|
    nested = ""
    if section[:sub_sections]
      # Start each nested list from an empty string. Passing the accumulated
      # markup down (as before) copied all earlier siblings into the child
      # <ol>.
      nested = %Q(<ol>#{gen_sub_nav_items("",filename,section[:sub_sections])}</ol>)
    end
    html + %Q(<li><a href="#{File.basename(filename)}\##{section[:page_num]}">#{section[:title]}</a>#{nested}</li>)
  end
end
104
+ end
@@ -0,0 +1,415 @@
1
+ # encoding: UTF-8
2
+ # =ExtractBookStruct
3
+ # ExtractBookStruct的目的是提取书的结构信息。
4
+ # ExtractBookStruct选择从TXT文档提取书的结构信息。对TXT的文档要有如下要求:
5
+ # 1. 文档的编码格式必须是UTF-8或GB2312,推荐使用UTF-8格式
6
+ # 2. 文档的内容只包含书内容部分(书名、作者、目录等信息应该不包含在文档内)
7
+ # 3. 文档的段落应该完整(有些PDF转换过来的文档会破坏句子,需要进行预处理)
8
+ # 4. 文档必须符合正常的文档流(错位的章节段落等情况将影响正常的结构提取)
9
+ # 5. 文档需要包含结构信息(例如: 卷、篇、部分、章(回)节或者有连续的序号)
10
+ # 6. 每个结构信息都应该独立成行。
11
+ #
12
+ #
13
+ # ==接口
14
+ #
15
+ # === ExtractBookStruct.from_txt
16
+ # 从文本文件中提取目录结构
17
+ #
18
+ # === ExtractBookStruct.from_epub
19
+ # 从EPUB文件中提取目录结构
20
+ #
21
+ # === ExtractBookStruct.from_html
22
+ # 从HTML中提取目录结构
23
+
24
+ require 'uuid'
25
+ require 'cgi'
26
+ require 'iconv'
27
+
28
+ module ExtractBookStruct
29
+ extend self
30
+ extend HeaderDetect
31
+
32
# Extracts the book structure from a plain-text file.
#
# +filename+:: path of the text file
# +options+:: forwarded unchanged to extract_book_struct
#
# Returns whatever extract_book_struct returns for the file's content.
def from_txt(filename,options={})
  # File.read closes the file again; File.open(...).read left the handle
  # open until garbage collection.
  content = File.read(filename)
  extract_book_struct(content,options)
end
36
+
37
# Extracts the book structure from an HTML file.
#
# The file is first converted to text by extract_text_from_file (called with
# the '.html' suffix); the text and +options+ then go to extract_book_struct.
def from_html(filename,options={})
  extract_book_struct(extract_text_from_file(filename,'.html'),options)
end
41
+
42
# Extracts the book structure from an EPUB file.
#
# The file is first converted to text by extract_text_from_file (called with
# the '.epub' suffix); the text and +options+ then go to extract_book_struct.
def from_epub(filename,options={})
  extract_book_struct(extract_text_from_file(filename,'.epub'),options)
end
46
+
47
# Turns raw book text into a DocBook document.
#
# +content+:: the book text; converted with Utils.to_utf8 when
#             Utils.detect_utf8 reports it is not UTF-8
# +options+:: :format - :text, :digital or :hybrid; when absent the type is
#             guessed by detect_struct_type. The options are also passed on
#             to the selected extractor.
#
# Returns the extractor's result, or nil for any other format value.
def extract_book_struct(content,options={})
  content = Utils.to_utf8(content) unless Utils.detect_utf8(content)
  paras = extract_paras(sanitize_for_epub_text(content))

  # Book type (text, digital, hybrid) => extractor method.
  extractors = {
    :text    => :extract_text_book_struct,
    :digital => :extract_digital_book_struct,
    :hybrid  => :extract_hybrid_book_struct
  }
  extractor = extractors[options[:format] || detect_struct_type(paras)]
  return nil unless extractor

  send(extractor,paras,options)
end
67
+
68
# Converts +filename+ to plain text with the external `ebook-convert`
# command and returns the text.
#
# +filename+:: path of the source document
# +format+:: suffix stripped from the basename to name the temporary output
#            (e.g. '.html', '.epub')
#
# The temporary "<basename>.txt" file is written to the current working
# directory and removed again, even when reading it fails.
# Raises a SystemCallError (e.g. Errno::ENOENT) when the command is missing
# or produced no output file.
def extract_text_from_file(filename,format)
  txt_path = "#{File.basename(filename,format)}.txt"
  # The array form runs the command without a shell, so file names with
  # spaces or shell metacharacters are passed through literally. The
  # command's standard output is read and discarded, as before.
  IO.popen(['ebook-convert',filename,txt_path]) { |io| io.read }
  File.read(txt_path)
ensure
  FileUtils.remove_file(txt_path,true) if txt_path
end
76
+
77
# Splits +content+ into paragraphs, one per non-empty line.
#
# Each line is passed through Utils.clean_text; lines whose cleaned text has
# zero length are dropped. Returns an empty array when +content+ is blank.
def extract_paras(content)
  return [] if content.blank?

  cleaned = content.each_line.map { |line| Utils.clean_text(line) }
  cleaned.select { |text| text.length > 0 }
end
86
+
87
# Guesses which kind of headings a book uses.
#
# Every paragraph is checked with the textual guessers (guess_volume?,
# guess_part?, guess_chapter?, guess_section?) and with
# guess_digital_header?.
#
# Returns :hybrid when both kinds were seen, :text or :digital when only
# one was, and :unknown when neither matched.
def detect_struct_type(paras)
  saw_text = false
  saw_digital = false

  paras.each do |para|
    textual = guess_volume?(para) || guess_part?(para) || guess_chapter?(para) || guess_section?(para)
    saw_text = true if textual
    saw_digital = true if guess_digital_header?(para)
  end

  return :hybrid if saw_text && saw_digital
  return :text if saw_text
  return :digital if saw_digital
  :unknown
end
110
+
111
+ # 从text类型书中提取结构
112
+ def extract_text_book_struct(content,options={})
113
+ # 标注结构信息
114
+ marked_content = mark_struct_info(content)
115
+
116
+ # 构建书结构
117
+ struct = build_struct(marked_content)
118
+
119
+ # 修正结构
120
+ revised_struct = revise_struct(struct)
121
+
122
+ # 生成docbook
123
+ build_doc_book(revised_struct,options)
124
+ end
125
+
126
+ # 从数字类型书中提取结构
127
+ def extract_digital_book_struct(content,options={})
128
+ marked_content = mark_digital_struct_info(content)
129
+
130
+ # 构建书结构
131
+ struct = build_struct(marked_content)
132
+
133
+ # 修正结构
134
+ revised_struct = revise_struct(struct)
135
+
136
+ # 生成docbook
137
+ build_doc_book(revised_struct,options)
138
+ end
139
+
140
+ # 从混合类型书中提取结构
141
+ def extract_hybrid_book_struct(content,options={})
142
+ marked_content = mark_hybrid_struct_info(content)
143
+
144
+ # 构建书结构
145
+ struct = build_struct(marked_content)
146
+
147
+ # 修正结构
148
+ revised_struct = revise_struct(struct)
149
+
150
+ # 生成docbook
151
+ build_doc_book(revised_struct,options)
152
+ end
153
+
154
+ # 标注结构信息
155
+ # 将内容以行分割顺序存放在数组中,并对行猜测是否为结构信息,将猜测的结果以哈希的形式保存在数组中。
156
+ def mark_struct_info(content)
157
+ marked_content = []
158
+ content.each do |text|
159
+ if text.length > 0
160
+ type = guess_header?(text)
161
+ if type
162
+ marked_content << {:title=>text,:type=>type}
163
+ else
164
+ marked_content << text
165
+ end
166
+ end
167
+ end
168
+ marked_content
169
+ end
170
+
171
# Marks structural information for hybrid books.
#
# Each non-empty line is tested with guess_header? first and, only when that
# finds nothing, with guess_digital_section?. A match becomes
# {:title => line, :type => guessed_type}; other lines stay plain strings.
#
# Returns the marked lines as a new array.
def mark_hybrid_struct_info(content)
  content.each_with_object([]) do |text, marked|
    next unless text.length > 0

    type = guess_header?(text) || guess_digital_section?(text)
    marked << (type ? {:title=>text,:type=>type} : text)
  end
end
190
+
191
# Marks structural information for digital (numbered-heading) books.
#
# Each non-empty line is tested with guess_header? first and, only when that
# finds nothing, with guess_digital_header?. A match becomes
# {:title => line, :type => guessed_type}; other lines stay plain strings.
#
# Returns the marked lines as a new array.
def mark_digital_struct_info(content)
  content.each_with_object([]) do |text, marked|
    next unless text.length > 0

    type = guess_header?(text) || guess_digital_header?(text)
    marked << (type ? {:title=>text,:type=>type} : text)
  end
end
210
+
211
+ # 修正结构 TODO
212
+ def revise_struct(struct)
213
+ struct
214
+ end
215
+
216
# Renders the book structure as a DocBook 5 document.
#
# +struct+:: top-level entries produced by build_struct; the table of
#            contents is derived from all of them, while only Hash entries
#            (structural nodes) are rendered as content - plain strings at
#            the top level are left out
# +options+:: :title, :author, :pubdate, :publisher - metadata for <info>;
#             missing values render as empty elements
#
# Returns the document as a String. The XML declaration is always the very
# first thing in the output and the metadata values are XML-escaped.
def build_doc_book(struct,options={})
  doc_toc = gen_docbook_toc(extract_toc_from_struct(struct))
  nodes = struct.select { |item| item.is_a?(Hash) }
  doc_content = gen_docbook_content(nodes)

  # Escape metadata so characters such as "&" or "<" cannot produce
  # malformed XML.
  meta = lambda { |key| CGI.escapeHTML(options[key].to_s) }

  [
    '<?xml version="1.0" encoding="utf-8"?>',
    '<book xmlns="http://docbook.org/ns/docbook" version="5.0">',
    '<info>',
    "<title>#{meta.call(:title)}</title>",
    "<author>#{meta.call(:author)}</author>",
    "<pubdate>#{meta.call(:pubdate)}</pubdate>",
    "<publisher>#{meta.call(:publisher)}</publisher>",
    '</info>',
    doc_toc,
    doc_content,
    '</book>'
  ].join("\n") + "\n"
end
239
+
240
# Builds the nested book structure from marked content.
#
# +content+:: array whose entries are either plain strings (text lines) or
#             hashes {:title => ..., :type => ...} marking a heading
#
# Headings are kept on an 8-slot stack, one slot per nesting level:
# 0 volume, 1 part, 2 chapter-like (chapter, appendix, index, glossary,
# preface, afterword), 3-7 sect1..sect5. When a heading arrives, every open
# node at the same or a deeper level is closed (attached to its nearest open
# ancestor, or to the result when it has none) and the heading becomes the
# open node of its level. Text lines are appended to the deepest open node,
# or to the result when no node is open. Headings of any other type are
# ignored.
#
# Returns an array of top-level entries; each node is a hash
# {:title, :type (String), :children}.
def build_struct(content)
  levels = {
    :volume => 0, :part => 1,
    :chapter => 2, :appendix => 2, :index => 2, :glossary => 2,
    :preface => 2, :afterword => 2,
    :sect1 => 3, :sect2 => 4, :sect3 => 5, :sect4 => 6, :sect5 => 7
  }
  # Under these chapter-level nodes a sect1 heading is kept as plain text.
  flat_parents = ['preface','appendix','index','glossary']

  stack = Array.new(8)
  struct = []

  # Closes every open node from the deepest level up to +level+.
  # closed_node pops from the array it is given, so it always receives a
  # slice. Handing it the live stack (as the sect5 case used to) removed the
  # open ancestor from the stack and lost that ancestor's whole subtree.
  close_down_to = lambda do |level|
    7.downto(level) do |index|
      closed_node(struct,stack[0..index])
      stack[index] = nil
    end
  end

  content.each do |line|
    if line.is_a?(Hash)
      type = line[:type].to_sym
      level = levels[type]
      next unless level

      if type == :sect1 && stack[2] && flat_parents.include?(stack[2][:type])
        stack[2][:children] << line[:title]
      else
        close_down_to.call(level)
        stack[level] = {:title=>line[:title],:type=>type.to_s,:children=>[]}
      end
    else
      # Deepest open node, if any.
      open_node = stack.compact.last
      (open_node ? open_node[:children] : struct) << line
    end
  end

  close_down_to.call(0)
  struct
end
326
+
327
# Closes the node at the end of +stack+.
#
# The last element is popped; when it is a node (not nil) it is appended to
# the children of the nearest non-nil entry further up the stack, or to
# +struct+ when no such entry exists. +stack+ is modified in place, so
# callers that want to keep their stack pass a copy or slice.
#
# Returns +struct+ when the node was appended to it, otherwise nil.
def closed_node(struct,stack)
  node = stack.pop
  return nil unless node

  # Pop upwards until an open ancestor turns up or only nil slots remain.
  parent = nil
  parent = stack.pop until parent || stack.none?

  if parent
    parent[:children] << node
    nil
  else
    struct << node
  end
end
344
+
345
# Derives the table of contents from a book structure.
#
# Only Hash entries (structural nodes) are kept; text lines are dropped.
# Each node is reduced to {:title, :type, :children}, with :children built
# recursively the same way.
def extract_toc_from_struct(struct)
  struct.select { |item| item.is_a?(Hash) }.map do |node|
    sub_toc = node[:children].any? ? extract_toc_from_struct(node[:children]) : []
    {:title=>node[:title], :type=> node[:type],:children=>sub_toc}
  end
end
359
+
360
# Wraps the entries rendered by gen_docbook_tocdiv in a DocBook <toc>
# element titled "Table of contents".
def gen_docbook_toc(toc)
  entries = gen_docbook_tocdiv(toc)
  "<toc><title>Table of contents</title>" + "#{entries}</toc>"
end
363
+
364
# Renders table-of-contents entries as nested DocBook <tocdiv> elements.
#
# +toc+:: array of hashes with :title and :children (same shape,
#         recursively), as produced by extract_toc_from_struct
#
# Titles are XML-escaped, since headings taken from raw book text may
# contain "&" or "<". Returns the concatenated markup as a String.
def gen_docbook_tocdiv(toc)
  toc.map do |item|
    children = item[:children].any? ? gen_docbook_tocdiv(item[:children]) : ""
    "<tocdiv><title>#{CGI.escapeHTML(item[:title].to_s)}</title>#{children}</tocdiv>"
  end.join("")
end
375
+
376
# Renders a book structure as DocBook content markup.
#
# +struct+:: array of nodes ({:title, :type, :children}) and text lines
#
# 'volume' and 'part' nodes become <part>; chapter-like nodes and
# sect1..sect5 become an element named after their type. Text lines are
# cleaned with Utils.clean_text, escaped, and wrapped in <para>; lines that
# are empty after cleaning are skipped. Every element gets a fresh UUID.
# Titles are escaped with Utils.escape_html, like paragraph text, because
# headings taken from raw book text may contain "&" or "<".
#
# NOTE(review): nodes of any other type - including 'afterword', which
# build_struct accepts - produce no output at all, children included.
# Confirm whether that is intended.
#
# Returns the rendered pieces joined with newlines.
def gen_docbook_content(struct)
  content = []
  struct.each do |item|
    if item.is_a?(Hash)
      children = item[:children].any? ? gen_docbook_content(item[:children]) : ""
      title = Utils.escape_html(item[:title].to_s)
      case item[:type]
      when 'volume','part'
        content << "<part label='#{UUID.generate}'>\n<info><title>#{title}</title>\n</info>#{children}\n</part>"
      when 'chapter','appendix','glossary','index','preface',
           'sect1','sect2','sect3','sect4','sect5'
        content << "<#{item[:type]} id='#{UUID.generate}'>\n<info>\n<title>#{title}</title>\n</info>#{children}\n</#{item[:type]}>"
      end
    else
      text = Utils.escape_html(Utils.clean_text(item))
      if text.length > 0
        content << "<para id='#{UUID.generate}'>#{text}</para>"
      end
    end
  end
  content.join("\n")
end
401
+
402
+ # sanitize_for_epub_text
403
+ def sanitize_for_epub_text(content)
404
+ return content if content.blank?
405
+ lines = []
406
+ content.each_line do |line|
407
+ unless line.downcase.include?('document outline')
408
+ lines << line
409
+ else
410
+ break;
411
+ end
412
+ end
413
+ lines.join("")
414
+ end
415
+ end