ebook_tools 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,104 @@
1
+ # encoding: UTF-8
2
  require 'cgi'
  require 'gepub'
  require 'uuid'
4
+
5
+ module EPUB
6
+ extend self
7
+ include Utils
8
+
9
+ # write_epub
10
+ # parameters:
11
+ # +epub_file+ 指定生成的epub文件
12
+ # +options+ 可选参数
13
+ # :files 指定打包到epub中的文件集合
14
+ # :title epub标题
15
+ # :author epub作者
16
+ def write_epub(epub_file,options={})
17
+ files = options[:files] || []
18
+ nav,files = extract_nav_from_files(files)
19
+ book = GEPUB::Book.new
20
+ book.set_main_id UUID.generate, {}
21
+ book.add_title options[:title]
22
+ book.version = '3.0'
23
+ book.instance_variable_get('@package').epub_backward_compat = false
24
+ book.add_creator options[:author]
25
+ book.publisher='www.nonobo.com'
26
+ book.add_item(File.basename(nav),nav,'nav').add_property('nav')
27
+ files.each do |file|
28
+ if File.extname(file) == '.html' || File.extname(file) == ".htm"
29
+ book.ordered{
30
+ book.add_item(File.basename(file),file)
31
+ }
32
+ else
33
+ book.add_item(File.basename(file),file)
34
+ end
35
+ end
36
+
37
+ Utils.make_destination_dir(epub_file)
38
+ book.generate_epub(epub_file)
39
+ end
40
+
41
+ # 生成EPUB3导航文件
42
+ def gen_nav_file(html_file,sections,options={})
43
+ temp_dir = options[:dir] || File.dirname(html_file)
44
+ nav_html = File.join(temp_dir,'nav.html')
45
+ html_content = gen_nav_file_content(html_file,sections,options={})
46
+ Utils.write_file(html_content,nav_html)
47
+ nav_html
48
+ end
49
+
50
+ def gen_nav_file_content(html_file,sections,options={})
51
+ opts = {:title => 'Table Of Contents'}.merge(options)
52
+ html_content =<<-EOS
53
+ <?xml version="1.0" encoding="utf-8"?>
54
+ <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
55
+ <head>
56
+ <META http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
57
+ </head>
58
+ <body>
59
+ <nav epub:type="toc" id="toc">
60
+ <h1>#{opts[:title]}</h1>
61
+ <ol>
62
+ #{gen_nav_items("",html_file,sections)}
63
+ </ol>
64
+ </nav>
65
+ </body>
66
+ </html>
67
+ EOS
68
+
69
+ html_content
70
+ end
71
+
72
+ private
73
+ def extract_nav_from_files(files=[])
74
+ nav = nil
75
+ files.dup.each do |f|
76
+ nav = f if File.basename(f,'.html') == 'nav'
77
+ end
78
+ files.delete_if{|file| File.basename(file,'.html') == 'nav'}
79
+ [nav,files]
80
+ end
81
+
82
+ def gen_nav_items(items,filename,sections)
83
+ item_set = []
84
+ sections.each do |section|
85
+ sub_items = ""
86
+ if section[:sub_sections]
87
+ sub_items = %Q(<ol>#{gen_sub_nav_items("",filename,section[:sub_sections])}</ol>)
88
+ end
89
+ item_set << %Q(<li><a href="#{File.basename(filename)}\##{section[:page_num]}">#{section[:title]}</a>#{sub_items}</li>)
90
+ end
91
+ item_set.join("")
92
+ end
93
+
94
+ def gen_sub_nav_items(items,filename,sections)
95
+ sections.each do |section|
96
+ sub_items = ""
97
+ if section[:sub_sections]
98
+ sub_items = %Q(<ol>#{gen_sub_nav_items(items,filename,section[:sub_sections])}</ol>)
99
+ end
100
+ items = items + %Q(<li><a href="#{File.basename(filename)}\##{section[:page_num]}">#{section[:title]}</a>#{sub_items}</li>)
101
+ end
102
+ items
103
+ end
104
+ end
@@ -0,0 +1,415 @@
1
+ # encoding: UTF-8
2
+ # =ExtractBookStruct
3
+ # ExtractBookStruct的目的是提取书的结构信息。
4
+ # ExtractBookStruct选择从TXT文档提取书的结构信息。对TXT的文档要有如下要求:
5
+ # 1. 文档的编码格式必须是UTF-8或GB2312,推荐使用UTF-8格式
6
+ # 2. 文档的内容只包含书内容部分(书名、作者、目录等信息应该不包含在文档内)
7
+ # 3. 文档的段落应该完整(有些PDF转换过来的文档会破坏句子,需要进行预处理)
8
+ # 4. 文档必须符合正常的文档流(错位的章节段落等情况将影响正常的结构提取)
9
+ # 5. 文档需要包含结构信息(例如: 卷、篇、部分、章(回)节或者有连续的序号)
10
+ # 6. 每个结构信息都应该独立成行。
11
+ #
12
+ #
13
+ # ==接口
14
+ #
15
+ # === ExtractBookStruct.from_txt
16
+ # 从文本文件中提取目录结构
17
+ #
18
+ # === ExtractBookStruct.from_epub
19
+ # 从EPUB文件中提取目录结构
20
+ #
21
+ # === ExtractBookStruct.from_html
22
+ # 从HTML中提取目录结构
23
+
24
+ require 'uuid'
25
+ require 'cgi'
26
+ require 'iconv'
27
+
28
+ module ExtractBookStruct
29
+ extend self
30
+ extend HeaderDetect
31
+
32
+ def from_txt(filename,options={})
33
+ content = File.open(filename).read
34
+ extract_book_struct(content,options)
35
+ end
36
+
37
+ def from_html(filename,options={})
38
+ content = extract_text_from_file(filename,'.html')
39
+ extract_book_struct(content,options)
40
+ end
41
+
42
+ def from_epub(filename,options={})
43
+ content = extract_text_from_file(filename,'.epub')
44
+ extract_book_struct(content,options)
45
+ end
46
+
47
+ def extract_book_struct(content,options={})
48
+ unless Utils.detect_utf8(content)
49
+ content = Utils.to_utf8(content)
50
+ end
51
+ content = sanitize_for_epub_text(content)
52
+ paras = extract_paras(content)
53
+
54
+ # 检查书类型(text,digital,hybrid)
55
+ format = options[:format] || detect_struct_type(paras)
56
+ case format
57
+ when :text
58
+ extract_text_book_struct(paras,options)
59
+ when :digital
60
+ extract_digital_book_struct(paras,options)
61
+ when :hybrid
62
+ extract_hybrid_book_struct(paras,options)
63
+ else
64
+ return nil
65
+ end
66
+ end
67
+
68
+ def extract_text_from_file(filename,format)
69
+ txt_file = File.basename(filename,format)
70
+ cmd = %Q(ebook-convert #{filename} #{txt_file}.txt)
71
+ output = `#{cmd}`
72
+ content = File.open("#{txt_file}.txt").read
73
+ FileUtils.remove_file("#{txt_file}.txt",true)
74
+ return content
75
+ end
76
+
77
+ def extract_paras(content)
78
+ paras = []
79
+ return paras if content.blank?
80
+ content.each_line do |line|
81
+ text = Utils.clean_text(line)
82
+ paras << text if text.length > 0
83
+ end
84
+ paras
85
+ end
86
+
87
+ def detect_struct_type(paras)
88
+ text_flag = false
89
+ digital_flag = false
90
+ paras.each do |para|
91
+ if guess_volume?(para) || guess_part?(para) || guess_chapter?(para) || guess_section?(para)
92
+ text_flag = true
93
+ end
94
+
95
+ if guess_digital_header?(para)
96
+ digital_flag = true
97
+ end
98
+ end
99
+
100
+ if text_flag && digital_flag
101
+ :hybrid
102
+ elsif text_flag
103
+ :text
104
+ elsif digital_flag
105
+ :digital
106
+ else
107
+ :unknown
108
+ end
109
+ end
110
+
111
+ # 从text类型书中提取结构
112
+ def extract_text_book_struct(content,options={})
113
+ # 标注结构信息
114
+ marked_content = mark_struct_info(content)
115
+
116
+ # 构建书结构
117
+ struct = build_struct(marked_content)
118
+
119
+ # 修正结构
120
+ revised_struct = revise_struct(struct)
121
+
122
+ # 生成docbook
123
+ build_doc_book(revised_struct,options)
124
+ end
125
+
126
+ # 从数字类型书中提取结构
127
+ def extract_digital_book_struct(content,options={})
128
+ marked_content = mark_digital_struct_info(content)
129
+
130
+ # 构建书结构
131
+ struct = build_struct(marked_content)
132
+
133
+ # 修正结构
134
+ revised_struct = revise_struct(struct)
135
+
136
+ # 生成docbook
137
+ build_doc_book(revised_struct,options)
138
+ end
139
+
140
+ # 从混合类型书中提取结构
141
+ def extract_hybrid_book_struct(content,options={})
142
+ marked_content = mark_hybrid_struct_info(content)
143
+
144
+ # 构建书结构
145
+ struct = build_struct(marked_content)
146
+
147
+ # 修正结构
148
+ revised_struct = revise_struct(struct)
149
+
150
+ # 生成docbook
151
+ build_doc_book(revised_struct,options)
152
+ end
153
+
154
+ # 标注结构信息
155
+ # 将内容以行分割顺序存放在数组中,并对行猜测是否为结构信息,将猜测的结果以哈希的形式保存在数组中。
156
+ def mark_struct_info(content)
157
+ marked_content = []
158
+ content.each do |text|
159
+ if text.length > 0
160
+ type = guess_header?(text)
161
+ if type
162
+ marked_content << {:title=>text,:type=>type}
163
+ else
164
+ marked_content << text
165
+ end
166
+ end
167
+ end
168
+ marked_content
169
+ end
170
+
171
+ def mark_hybrid_struct_info(content)
172
+ marked_content = []
173
+ content.each do |text|
174
+ if text.length > 0
175
+ type = guess_header?(text)
176
+ if type
177
+ marked_content << {:title=>text,:type=>type}
178
+ else
179
+ type = guess_digital_section?(text)
180
+ if type
181
+ marked_content << {:title=>text,:type=>type}
182
+ else
183
+ marked_content << text
184
+ end
185
+ end
186
+ end
187
+ end
188
+ marked_content
189
+ end
190
+
191
+ def mark_digital_struct_info(content)
192
+ marked_content = []
193
+ content.each do |text|
194
+ if text.length > 0
195
+ type = guess_header?(text)
196
+ if type
197
+ marked_content << {:title=>text,:type=>type}
198
+ else
199
+ type = guess_digital_header?(text)
200
+ if type
201
+ marked_content << {:title=>text,:type=>type}
202
+ else
203
+ marked_content << text
204
+ end
205
+ end
206
+ end
207
+ end
208
+ marked_content
209
+ end
210
+
211
+ # 修正结构 TODO
212
+ def revise_struct(struct)
213
+ struct
214
+ end
215
+
216
+ def build_doc_book(struct,options={})
217
+ toc = extract_toc_from_struct(struct)
218
+
219
+ doc_toc = gen_docbook_toc(toc)
220
+
221
+ struct = struct.map{|item| item if item.is_a?(Hash)}.compact
222
+
223
+ doc_content = gen_docbook_content(struct)
224
+
225
+ <<-EOS
226
+ <?xml version="1.0" encoding="utf-8"?>
227
+ <book xmlns="http://docbook.org/ns/docbook" version="5.0">
228
+ <info>
229
+ <title>#{options[:title]}</title>
230
+ <author>#{options[:author]}</author>
231
+ <pubdate>#{options[:pubdate]}</pubdate>
232
+ <publisher>#{options[:publisher]}</publisher>
233
+ </info>
234
+ #{doc_toc}
235
+ #{doc_content}
236
+ </book>
237
+ EOS
238
+ end
239
+
240
+ def build_struct(content)
241
+ stack = Array.new(8)
242
+ struct = []
243
+ content.each do |line|
244
+ if line.is_a?(Hash)
245
+ case type = line[:type].to_sym
246
+ when :volume
247
+ 7.downto(0) do |index|
248
+ closed_node(struct,stack[0..index])
249
+ stack[index]=nil
250
+ end
251
+ stack[0] = {:title=>line[:title],:type=>type.to_s,:children=>[]}
252
+ when :part
253
+ 7.downto(1) do |index|
254
+ closed_node(struct,stack[0..index])
255
+ stack[index]=nil
256
+ end
257
+ stack[1] = {:title=>line[:title],:type=>type.to_s,:children=>[]}
258
+ when :chapter,:appendix,:index,:glossary,:preface,:afterword
259
+ 7.downto(2) do |index|
260
+ closed_node(struct,stack[0..index])
261
+ stack[index]=nil
262
+ end
263
+ stack[2] = {:title=>line[:title],:type=>type.to_s,:children=>[]}
264
+ when :sect1
265
+ if stack[2] && ['preface','appendix','index','glossary'].include?(stack[2][:type])
266
+ stack[2][:children] << line[:title]
267
+ else
268
+ 7.downto(3) do |index|
269
+ closed_node(struct,stack[0..index])
270
+ stack[index]=nil
271
+ end
272
+ stack[3] = {:title=>line[:title],:type=>type.to_s,:children=>[]}
273
+ end
274
+ when :sect2
275
+ 7.downto(4) do |index|
276
+ closed_node(struct,stack[0..index])
277
+ stack[index]=nil
278
+ end
279
+ stack[4] = {:title=>line[:title],:type=>type.to_s,:children=>[]}
280
+ when :sect3
281
+ 7.downto(5) do |index|
282
+ closed_node(struct,stack[0..index])
283
+ stack[index]=nil
284
+ end
285
+ stack[5] = {:title=>line[:title],:type=>type.to_s,:children=>[]}
286
+ when :sect4
287
+ 7.downto(6) do |index|
288
+ closed_node(struct,stack[0..index])
289
+ stack[index]=nil
290
+ end
291
+ stack[6] = {:title=>line[:title],:type=>type.to_s,:children=>[]}
292
+ when :sect5
293
+ closed_node(struct,stack)
294
+ stack[7] = {:title=>line[:title],:type=>type.to_s,:children=>[]}
295
+ end
296
+ else
297
+ if stack[7]
298
+ stack[7][:children] << line
299
+ elsif stack[6]
300
+ stack[6][:children] << line
301
+ elsif stack[5]
302
+ stack[5][:children] << line
303
+ elsif stack[4]
304
+ stack[4][:children] << line
305
+ elsif stack[3]
306
+ stack[3][:children] << line
307
+ elsif stack[2]
308
+ stack[2][:children] << line
309
+ elsif stack[1]
310
+ stack[1][:children] << line
311
+ elsif stack[0]
312
+ stack[0][:children] << line
313
+ else
314
+ struct << line
315
+ end
316
+ end
317
+ end
318
+
319
+ 7.downto(0) do |index|
320
+ closed_node(struct,stack[0..index])
321
+ stack[index] = nil
322
+ end
323
+
324
+ struct
325
+ end
326
+
327
+ def closed_node(struct,stack)
328
+ last = stack.pop
329
+ if last
330
+ result = false
331
+ while stack.any?
332
+ item = stack.pop
333
+ if item
334
+ item[:children] << last
335
+ result = true
336
+ break
337
+ end
338
+ end
339
+ if result == false
340
+ struct << last
341
+ end
342
+ end
343
+ end
344
+
345
+ def extract_toc_from_struct(struct)
346
+ toc = []
347
+ struct.each do |item|
348
+ if item.is_a?(Hash)
349
+ children = []
350
+ if item[:children].any?
351
+ children = extract_toc_from_struct(item[:children])
352
+ end
353
+ item_hash = {:title=>item[:title], :type=> item[:type],:children=>children}
354
+ toc << item_hash
355
+ end
356
+ end
357
+ toc
358
+ end
359
+
360
+ def gen_docbook_toc(toc)
361
+ "<toc><title>Table of contents</title>#{gen_docbook_tocdiv(toc)}</toc>"
362
+ end
363
+
364
+ def gen_docbook_tocdiv(toc)
365
+ doc_toc = []
366
+ toc.each do |item|
367
+ children = ""
368
+ if item[:children].any?
369
+ children = gen_docbook_tocdiv(item[:children])
370
+ end
371
+ doc_toc << "<tocdiv><title>#{item[:title]}</title>#{children}</tocdiv>"
372
+ end
373
+ doc_toc.join("")
374
+ end
375
+
376
+ def gen_docbook_content(struct)
377
+ content = []
378
+ struct.each do |item|
379
+ if item.is_a?(Hash)
380
+ children = ""
381
+ if item[:children].any?
382
+ children = gen_docbook_content(item[:children])
383
+ end
384
+ case item[:type]
385
+ when 'volume','part'
386
+ content << "<part label='#{UUID.generate}'>\n<info><title>#{item[:title]}</title>\n</info>#{children}\n</part>"
387
+ when 'chapter','appendix','glossary','index','preface'
388
+ content << "<#{item[:type]} id='#{UUID.generate}'>\n<info>\n<title>#{item[:title]}</title>\n</info>#{children}\n</#{item[:type]}>"
389
+ when 'sect1','sect2','sect3','sect4','sect5'
390
+ content << "<#{item[:type]} id='#{UUID.generate}'>\n<info>\n<title>#{item[:title]}</title>\n</info>#{children}\n</#{item[:type]}>"
391
+ end
392
+ else
393
+ text = Utils.escape_html(Utils.clean_text(item))
394
+ if text.length > 0
395
+ content << "<para id='#{UUID.generate}'>#{text}</para>"
396
+ end
397
+ end
398
+ end
399
+ content.join("\n")
400
+ end
401
+
402
+ # sanitize_for_epub_text
403
+ def sanitize_for_epub_text(content)
404
+ return content if content.blank?
405
+ lines = []
406
+ content.each_line do |line|
407
+ unless line.downcase.include?('document outline')
408
+ lines << line
409
+ else
410
+ break;
411
+ end
412
+ end
413
+ lines.join("")
414
+ end
415
+ end