extract_book_struct 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG ADDED
@@ -0,0 +1,6 @@
1
+ 0.0.2 2013.3.28
2
+ * fix GB2312格式的文本文件提取错误
3
+ * 新增batch_extract_book_struct命令行工具
4
+
5
+ 0.0.1 2013.3.28
6
+ init release
data/README ADDED
@@ -0,0 +1,76 @@
1
+ # encoding: UTF-8
2
+ # = ExtractBookStruct
3
+ # ExtractBookStruct的目的是从各类电子书内容中提取书的结构信息。目前支持txt,epub,html。
4
+ # ExtractBookStruct选择从TXT文档提取书的结构信息。对TXT的文档要有如下要求:
5
+ # 1. 文档的编码格式必须是UTF-8或GB2312,推荐使用UTF-8格式
6
+ # 2. 文档的内容只包含书内容部分(书名、作者、目录等信息应该不包含在文档内)
7
+ # 3. 文档的段落应该完整(有些PDF转换过来的文档会破坏句子,需要进行预处理)
8
+ # 4. 文档必须符合正常的文档流(错位的章节段落等情况将影响正常的结构提取)
9
+ # 5. 文档需要包含结构信息(例如: 卷、篇、部分、章(回)节或者有连续的序号)
10
+ # 6. 每个结构信息都应该独立成行。
11
+ #
12
+ # 文档结构信息分析
13
+ # 一本书在编排的时候会有自己的结构信息,这些结构信息通常通过卷、篇、部分、章(回)节等表述,也会使用序号的方式表述。总体上可以分为以下几种:
14
+ # 1. 文本描述(text): 按卷、部分(篇)、章(回)、节等文字表述
15
+ # 2. 数字描述(digital): 所有结构信息都是按照数字序号表示,比如 1 xxxxx; 1.1 xxxxx
16
+ # 3. 混合描述(hybrid):章按照文字表述,节按照序号表示,比如 1.1 xxxxxx
17
+ # 根据不同的类型,对结构信息的提取采用不同的处理手段。
18
+ #
19
+ # 有效的标题信息应该符合以下规则:
20
+ # 1. 标题应该不包含完整的句子(应该不包含句子分隔符,例如"。","!"等)
21
+ # 2. 应该包含结构信息表述,具体如下:
22
+ # 文本描述:
23
+ # 卷: 以"第xxx卷"开始
24
+ # 以"卷"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
25
+ # 以"volume"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
26
+ # 部分(篇): 以"第xxx部"或"第xxx篇"开始
27
+ # 以"part"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
28
+ # 章(回): 以"第xxx章"或"第xxx回"开始
29
+ # 以"chapter"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
30
+ # 节: 以"第xxx节"开始
31
+ # 前言: 以"前"开始,以"言"结束,中间加入空白字符。例如"前言","前 言"等。
32
+ # 以"序"开始,以"言"结束,中间加入空白字符。例如"序言","序 言"等。
33
+ # 单个"序"
34
+ # 以"序"或"序言"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
35
+ # "preface"
36
+ # "foreword"
37
+ # 以"preface"或"foreword"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
38
+ # 索引: 以"索"开始,以"引"结束,中间加入空白字符。例如"索引","索 引"等。
39
+ # 以"索引"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
40
+ # "index"
41
+ # 以"index"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
42
+ # 附录: 以"附"开始,以"录"结束,中间加入空白字符。例如"附录","附 录"等。
43
+ # 以"附录"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
44
+ # "appendix"
45
+ # 以"appendix"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
46
+ # 术语: 以"术"开始,以"语"结束,中间加入空白字符。例如"术语","术 语"等。
47
+ # 以"术语"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
48
+ # "glossary"
49
+ # 以"glossary"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
50
+ #
51
+ # 数字描述:
52
+ # 以数字序号层级表达,数字序号和标题内容之间有空白字符分隔。例如"1 管理的概念", "1.1 定义", "1.1.1 管理"等。
53
+ #
54
+ # ==API接口
55
+ #
56
+ # === ExtractBookStruct.from_txt
57
+ # 从文本文件中提取目录结构,使用示例:
58
+ # ExtractBookStruct.from_txt('1.txt',{:title=>'title',:author=>'author'})
59
+ #
60
+ # === ExtractBookStruct.from_epub
61
+ # 从EPUB文件中提取目录结构,使用示例:
62
+ # ExtractBookStruct.from_epub('1.epub',{:title=>'title',:author=>'author'})
63
+ #
64
+ # === ExtractBookStruct.from_html
65
+ # 从HTML中提取目录结构,使用示例:
66
+ # ExtractBookStruct.from_html('1.html',{:title=>'title',:author=>'author'})
67
+ #
68
+ # == 命令行工具
69
+ # extract_book_struct,使用示例:
70
+ # extract_book_struct '1.txt' '1.xml'
71
+ #
72
+ # == 依赖
73
+ # ExtractBookStruct依赖以下工具和包:
74
+ # ebook-convert: calibre cli tools.
75
+ # uuid: ruby gem.
76
+ # iconv: ruby gem.
@@ -0,0 +1,58 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ require 'rubygems'
4
+ require 'optparse'
5
+ require File.join(File.expand_path('../../',__FILE__),'lib','batch_extract')
6
+
7
# Prints the usage text of the batch_extract_book_struct command to standard
# output and then terminates the process via +exit+.
#
# The first line of the usage text names this tool (batch_extract_book_struct);
# it previously showed the name of the single-file tool.
def help
  puts <<-EOF
batch_extract_book_struct: 批量提取书结构信息
usage:
batch_extract_book_struct [options] source_dir destination_dir

source_dir: 指定需要提取结构信息的书所在目录
destination_dir: 指定提取的书结构信息所输出的文件目录

options:
-F,--format 指定要提取书的格式

适用对象要求:
1. 编码格式为utf-8
  EOF
  exit
end
24
+
25
# FileUtils is used below; require it here instead of relying on another
# library having loaded it.
require 'fileutils'

# Parse the command line. OptionParser#parse returns the arguments that are
# left once every switch (and its value) has been consumed, so the two
# positional arguments are taken from that remainder instead of raw ARGV.
# Reading raw ARGV mistook option values for directories, e.g. with
# "src dst -F .txt".
options = {}
parser = OptionParser.new do |o|
  o.on('-F format', '--format format', 'format') do |format|
    options[:format] = format
  end

  # help prints the usage text and exits.
  o.on('-h', '--help') do
    help
  end
end
positional = parser.parse(ARGV)

source_dir = positional[-2]
destination_dir = positional[-1]

# Both directories are mandatory; help exits the process.
help if source_dir.nil? || destination_dir.nil?

unless File.directory?(source_dir)
  puts "error: source_dir #{source_dir} not is directory"
  exit 1
end

begin
  # mkdir_p is a no-op for an existing directory, so no existence check
  # (formerly Dir.exists?, removed in Ruby 3.2) is needed.
  FileUtils.mkdir_p(destination_dir)
rescue SystemCallError
  puts "error: destination_dir #{destination_dir} not created"
  exit 1
end

BatchExtract.batch_extract_from_dir(source_dir, destination_dir, options)
@@ -0,0 +1,95 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ require 'rubygems'
4
+ require 'optparse'
5
+ require File.join(File.expand_path('../../',__FILE__),'lib','extract_book_struct')
6
+
7
# Writes the usage text of the extract_book_struct command to standard output
# and then terminates the process via +exit+.
def help
  usage = <<-EOF
extract_book_struct: 提取书结构信息
usage:
extract_book_struct [options] source_file docbook_file

source_file: 指定需要提取结构信息的书文件
docbook_file: 指定提取的书结构信息所输出的文件

options:
-T <title>, --title <title> : 书的标题
-A <author>, --author <author> : 书作者
--pubdate <pubdate> : 出版时间
--publisher <publisher> : 出版社

适用对象要求:
1. 编码格式为utf-8
  EOF
  puts usage
  exit
end
27
+
28
# FileUtils is used below; require it here instead of relying on another
# library having loaded it.
require 'fileutils'

# Parse the command line. The positional arguments are taken from the
# remainder returned by OptionParser#parse, not from raw ARGV, so switches
# placed after the file names are no longer mistaken for file names.
options = {}
parser = OptionParser.new do |o|
  o.on('-T title', '--title title', 'title') { |title| options[:title] = title }
  o.on('-A author', '--author author', 'author') { |author| options[:author] = author }
  o.on('--publisher publisher', 'publisher') { |publisher| options[:publisher] = publisher }
  o.on('--pubdate pubdate', 'pubdate') { |pubdate| options[:pubdate] = pubdate }
  # help prints the usage text and exits.
  o.on('-h', '--help') { help }
end
positional = parser.parse(ARGV)

source_file = positional[-2]
docbook_file = positional[-1]

# Both file names are mandatory; help exits the process.
help if source_file.nil? || docbook_file.nil?

# File.exist? replaces File.exists?, which was removed in Ruby 3.2.
unless File.exist?(source_file)
  puts "error: source_file #{source_file} no found"
  exit 1
end

begin
  # mkdir_p is a no-op when the directory already exists.
  FileUtils.mkdir_p(File.dirname(docbook_file))
rescue SystemCallError
  puts "error: docbook_file #{docbook_file} not created"
  exit 1
end

# Keep the extension as written for File.basename (its suffix match is
# case-sensitive) and use the lower-cased form only for format dispatch.
raw_ext = File.extname(source_file)
ext_name = raw_ext.downcase
options[:title] ||= File.basename(source_file, raw_ext)
unless ['.html', '.txt', '.epub'].include?(ext_name)
  puts "source_file不是允许的文件格式: txt,html,epub"
  exit 1
end

begin
  docbook_xml = case ext_name
                when '.html' then ExtractBookStruct.from_html(source_file, options)
                when '.txt'  then ExtractBookStruct.from_txt(source_file, options)
                when '.epub' then ExtractBookStruct.from_epub(source_file, options)
                end
  # A nil result means no structure was detected; nothing is written then.
  if docbook_xml
    File.open(docbook_file, 'wb') { |file| file.write docbook_xml }
    puts "success: extract book struct successfully!"
  end
rescue => e
  # Report what went wrong, not only where.
  puts "error: #{source_file} #{e.class}: #{e.message}\n#{e.backtrace.join("\n")}"
  exit 1
end
@@ -0,0 +1,76 @@
1
+ # encoding: UTF-8
2
+ require 'pathname'
3
+ require File.join(File.expand_path('../../',__FILE__),'lib','extract_book_struct')
4
+
5
# Batch front end for ExtractBookStruct: walks a directory tree, extracts the
# structure of every supported book file and writes one XML file per book into
# a mirrored tree under the destination directory.
module BatchExtract
  extend self

  # Extracts the book structure of every file found below +source+.
  #
  # Parameters:
  # +source+      directory that is scanned recursively
  # +destination+ directory that receives the generated "<basename>.xml"
  #               files; the sub-directory layout of +source+ is mirrored and
  #               missing directories are created
  # +options+     optional hash. :format restricts the scan to one file
  #               extension (e.g. '.txt' or 'txt'); every other entry is
  #               passed on to ExtractBookStruct unchanged.
  #
  # Files whose extension is not .html, .txt or .epub are skipped. An
  # exception raised while extracting a file propagates to the caller.
  # Returns the list of scanned file names.
  def batch_extract_from_dir(source, destination, options={})
    # Work on a copy so the caller's hash keeps its :format entry. The key is
    # removed because ExtractBookStruct.extract_book_struct reads
    # options[:format] as the structure type, not as a file extension.
    options = options.dup
    format = options.delete(:format)
    source_root = Pathname.new(source)
    destination_root = Pathname.new(destination)

    scan_file_from_dir(source, :format => format).each do |file|
      extname = File.extname(file)
      # Path of the file's directory relative to the source root; only the
      # leading source prefix is removed (gsub used to remove every match).
      relative_dir = Pathname.new(file).relative_path_from(source_root).dirname
      dest_file = destination_root + relative_dir + "#{File.basename(file, extname)}.xml"

      puts "start extract #{file} ..."
      docbook_xml = case extname.downcase
                    when '.html' then ExtractBookStruct.from_html(file, options)
                    when '.txt'  then ExtractBookStruct.from_txt(file, options)
                    when '.epub' then ExtractBookStruct.from_epub(file, options)
                    end
      next unless docbook_xml

      # Nested source directories need matching destination directories.
      dest_file.dirname.mkpath
      File.open(dest_file.to_s, 'wb') { |out| out.write(docbook_xml) }
      puts "success: extract book struct successfully!"
    end
  end

  # Collects the files below +dir+ (recursively) as an array of path strings.
  #
  # Parameters:
  # +dir+     directory to scan
  # +options+ optional hash; :format limits the result to files with that
  #           extension, e.g. :format => '.pdf'
  def scan_file_from_dir(dir, options={})
    files = []
    walk_dir(dir, options) { |file| files << file.to_s }
    files
  end

  # Yields a Pathname for every regular file below +path_str+, descending into
  # sub-directories. When options[:format] is set, only files with that
  # extension are yielded; the comparison ignores case and accepts the
  # extension with or without the leading dot.
  def walk_dir(path_str, options={}, &block)
    wanted = options[:format]
    if wanted
      wanted = wanted.to_s.downcase
      wanted = ".#{wanted}" unless wanted.empty? || wanted.start_with?('.')
    end

    Pathname.new(path_str).children.each do |entry|
      if entry.directory?
        # Pass the options down so the filter also applies to sub-directories.
        walk_dir(entry, options, &block)
      elsif entry.file?
        yield entry if !wanted || entry.extname.downcase == wanted
      end
    end
  end
end
@@ -0,0 +1,594 @@
1
+ # encoding: UTF-8
2
+ # =ExtractBookStruct
3
+ # ExtractBookStruct的目的是提取书的结构信息。
4
+ # ExtractBookStruct选择从TXT文档提取书的结构信息。对TXT的文档要有如下要求:
5
+ # 1. 文档的编码格式必须是UTF-8或GB2312,推荐使用UTF-8格式
6
+ # 2. 文档的内容只包含书内容部分(书名、作者、目录等信息应该不包含在文档内)
7
+ # 3. 文档的段落应该完整(有些PDF转换过来的文档会破坏句子,需要进行预处理)
8
+ # 4. 文档必须符合正常的文档流(错位的章节段落等情况将影响正常的结构提取)
9
+ # 5. 文档需要包含结构信息(例如: 卷、篇、部分、章(回)节或者有连续的序号)
10
+ # 6. 每个结构信息都应该独立成行。
11
+ #
12
+ # 文档结构信息分析
13
+ # 一本书在编排的时候会有自己的结构信息,这些结构信息通常通过卷、篇、部分、章(回)节等表述,也会使用序号的方式表述。总体上可以分为以下几种:
14
+ # 1. 文本描述(text): 按卷、部分(篇)、章(回)、节等文字表述
15
+ # 2. 数字描述(digital): 所有结构信息都是按照数字序号表示,比如 1 xxxxx; 1.1 xxxxx
16
+ # 3. 混合描述(hybrid):章按照文字表述,节按照序号表示,比如 1.1 xxxxxx
17
+ # 根据不同的类型,对结构信息的提取采用不同的处理手段。
18
+ #
19
+ # 有效的标题信息应该符合以下规则:
20
+ # 1. 标题应该不包含完整的句子(应该不包含句子分隔符,例如"。","!"等)
21
+ # 2. 应该包含结构信息表述,具体如下:
22
+ # 文本描述:
23
+ # 卷: 以"第xxx卷"开始
24
+ # 以"卷"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
25
+ # 以"volume"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
26
+ # 部分(篇): 以"第xxx部"或"第xxx篇"开始
27
+ # 以"part"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
28
+ # 章(回): 以"第xxx章"或"第xxx回"开始
29
+ # 以"chapter"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
30
+ # 节: 以"第xxx节"开始
31
+ # 前言: 以"前"开始,以"言"结束,中间加入空白字符。例如"前言","前 言"等。
32
+ # 以"序"开始,以"言"结束,中间加入空白字符。例如"序言","序 言"等。
33
+ # 单个"序"
34
+ # 以"序"或"序言"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
35
+ # "preface"
36
+ # "foreword"
37
+ # 以"preface"或"foreword"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
38
+ # 索引: 以"索"开始,以"引"结束,中间加入空白字符。例如"索引","索 引"等。
39
+ # 以"索引"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
40
+ # "index"
41
+ # 以"index"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
42
+ # 附录: 以"附"开始,以"录"结束,中间加入空白字符。例如"附录","附 录"等。
43
+ # 以"附录"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
44
+ # "appendix"
45
+ # 以"appendix"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
46
+ # 术语: 以"术"开始,以"语"结束,中间加入空白字符。例如"术语","术 语"等。
47
+ # 以"术语"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
48
+ # "glossary"
49
+ # 以"glossary"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
50
+ #
51
+ # 数字描述:
52
+ # 以数字序号层级表达,数字序号和标题内容之间有空白字符分隔。例如"1 管理的概念", "1.1 定义", "1.1.1 管理"等。
53
+ #
54
+ # ==接口
55
+ #
56
+ # === ExtractBookStruct.from_txt
57
+ # 从文本文件中提取目录结构
58
+ #
59
+ # === ExtractBookStruct.from_epub
60
+ # 从EPUB文件中提取目录结构
61
+ #
62
+ # === ExtractBookStruct.from_html
63
+ # 从HTML中提取目录结构
64
+
65
+ require 'uuid'
66
+ require 'cgi'
67
+ require 'iconv'
68
+
69
+ module ExtractBookStruct
70
+ extend self
71
# Extracts the book structure from a plain-text file.
#
# The file content is converted with to_utf8 when detect_utf8 rejects it, cut
# off by sanitize_for_epub_text, split into paragraphs and handed to
# extract_book_struct.
#
# +filename+ path of the text file
# +options+  passed on to extract_book_struct
#
# Returns whatever extract_book_struct returns for the paragraphs.
def from_txt(filename, options={})
  # File.read closes the file handle; File.open(...).read left it open.
  content = File.read(filename)
  content = to_utf8(content) unless detect_utf8(content)
  paras = extract_paras(sanitize_for_epub_text(content))
  extract_book_struct(paras, options)
end
80
+
81
# Extracts the book structure from an HTML file.
#
# The file is turned into text by extract_text_from_file, converted with
# to_utf8 when detect_utf8 rejects it, split into paragraphs and handed to
# extract_book_struct together with +options+.
def from_html(filename, options={})
  raw = extract_text_from_file(filename, '.html')
  utf8 = detect_utf8(raw) ? raw : to_utf8(raw)
  extract_book_struct(extract_paras(utf8), options)
end
87
+
88
# Extracts the book structure from an EPUB file.
#
# The file is turned into text by extract_text_from_file, converted with
# to_utf8 when detect_utf8 rejects it, split into paragraphs and handed to
# extract_book_struct together with +options+.
def from_epub(filename, options={})
  raw = extract_text_from_file(filename, '.epub')
  utf8 = detect_utf8(raw) ? raw : to_utf8(raw)
  extract_book_struct(extract_paras(utf8), options)
end
94
+
95
# Dispatches the paragraphs to the extractor that matches the book's
# structure type.
#
# The type is options[:format] when given, otherwise the result of
# detect_struct_type(paras). :text, :digital and :hybrid are handled by the
# corresponding extract_*_book_struct method; for any other value a warning is
# printed and nil is returned.
def extract_book_struct(paras, options={})
  struct_type = options[:format] || detect_struct_type(paras)

  if :text == struct_type
    extract_text_book_struct(paras, options)
  elsif :digital == struct_type
    extract_digital_book_struct(paras, options)
  elsif :hybrid == struct_type
    extract_hybrid_book_struct(paras, options)
  else
    puts "警告: 没有检测到书结构信息."
    nil
  end
end
110
+
111
# Converts +filename+ to plain text with the external ebook-convert tool and
# returns the text after passing it through sanitize_for_epub_text.
#
# +filename+ path of the source file
# +format+   extension (e.g. '.epub') stripped from the file name to build the
#            name of the temporary "<basename>.txt" file, which is created in
#            the current working directory and removed again afterwards
#
# Raises the error of File.read (e.g. Errno::ENOENT) when the conversion did
# not produce the text file.
def extract_text_from_file(filename, format)
  txt_path = "#{File.basename(filename, format)}.txt"
  begin
    # Argument-vector form: no shell is involved, so file names containing
    # spaces or shell metacharacters are passed through untouched. Standard
    # output of the tool is discarded, as it was with backticks.
    system('ebook-convert', filename, txt_path, :out => File::NULL)
    content = File.read(txt_path)
  ensure
    # Remove the temporary file even when reading it failed.
    File.delete(txt_path) if File.exist?(txt_path)
  end
  sanitize_for_epub_text(content)
end
119
+
120
# Splits +content+ into lines, cleans each one with clean_text and returns
# the non-empty results as an array.
def extract_paras(content)
  content.each_line.map { |line| clean_text(line) }.reject { |text| text.length == 0 }
end
128
+
129
# Classifies the book by the kinds of headings found in +paras+.
#
# Returns :hybrid when both textual headings (volume, part, chapter, section)
# and numeric headings occur, :text or :digital when only one kind occurs,
# and :unknown when neither is found.
def detect_struct_type(paras)
  has_text = paras.any? do |para|
    guess_volume?(para) || guess_part?(para) || guess_chapter?(para) || guess_section?(para)
  end
  has_digital = paras.any? { |para| guess_digital_head_line?(para) }

  if has_text
    has_digital ? :hybrid : :text
  elsif has_digital
    :digital
  else
    :unknown
  end
end
152
+
153
# Extracts the structure of a :text type book.
#
# Pipeline: mark the heading lines, build the nested structure, revise it and
# render it as DocBook using +options+.
def extract_text_book_struct(content, options={})
  marked = mark_struct_info(content)
  revised = revise_struct(build_struct(marked))
  build_doc_book(revised, options)
end
167
+
168
# Extracts the structure of a :digital type book.
#
# Pipeline: mark the numeric heading lines, build the nested structure,
# revise it and render it as DocBook using +options+.
def extract_digital_book_struct(content, options={})
  marked = mark_digital_struct_info(content)
  revised = revise_struct(build_struct(marked))
  build_doc_book(revised, options)
end
181
+
182
# Extracts the structure of a :hybrid type book.
#
# Pipeline: mark the textual and numeric heading lines, build the nested
# structure, revise it and render it as DocBook using +options+.
def extract_hybrid_book_struct(content, options={})
  marked = mark_hybrid_struct_info(content)
  revised = revise_struct(build_struct(marked))
  build_doc_book(revised, options)
end
195
+
196
# Marks the structure information in +content+ (an array of lines).
#
# Every non-empty line is checked with guess_head_line?. A recognised heading
# is replaced by a hash {:title => line, :type => type}; every other line is
# kept as a plain string. Empty lines are dropped. Returns the new array.
def mark_struct_info(content)
  content.each_with_object([]) do |text, marked|
    next unless text.length > 0
    type = guess_head_line?(text)
    marked << (type ? {:title => text, :type => type} : text)
  end
end
212
+
213
# Marks the structure information of a hybrid book.
#
# Every non-empty line is checked with guess_head_line? first and, when that
# finds nothing, with guess_digital_section?. A recognised heading becomes a
# hash {:title => line, :type => type}; other lines stay plain strings. Empty
# lines are dropped. Returns the new array.
def mark_hybrid_struct_info(content)
  content.each_with_object([]) do |text, marked|
    next unless text.length > 0
    type = guess_head_line?(text) || guess_digital_section?(text)
    marked << (type ? {:title => text, :type => type} : text)
  end
end
232
+
233
# Marks the structure information of a numerically structured book.
#
# Every non-empty line is checked with guess_head_line? first and, when that
# finds nothing, with guess_digital_head_line?. A recognised heading becomes a
# hash {:title => line, :type => type}; other lines stay plain strings. Empty
# lines are dropped. Returns the new array.
def mark_digital_struct_info(content)
  content.each_with_object([]) do |text, marked|
    next unless text.length > 0
    type = guess_head_line?(text) || guess_digital_head_line?(text)
    marked << (type ? {:title => text, :type => type} : text)
  end
end
252
+
253
# Revises the built structure.
# TODO: not implemented yet; the structure is handed back unchanged.
def revise_struct(struct)
  return struct
end
257
+
258
# Renders the book structure as a DocBook 5.0 XML document string.
#
# +struct+  array produced by build_struct/revise_struct; top-level entries
#           that are not hashes (text before the first heading) are left out
#           of the body
# +options+ metadata hash; :title, :author, :pubdate and :publisher are
#           written into the <info> element. Missing values render as empty
#           elements. The values are XML-escaped so characters such as "&" or
#           "<" cannot break the document.
def build_doc_book(struct, options={})
  doc_toc = gen_docbook_toc(extract_toc_from_struct(struct))
  doc_content = gen_docbook_content(struct.select { |item| item.is_a?(Hash) })

  title     = escape_html(options[:title].to_s)
  author    = escape_html(options[:author].to_s)
  pubdate   = escape_html(options[:pubdate].to_s)
  publisher = escape_html(options[:publisher].to_s)

  <<-EOS
<?xml version="1.0" encoding="utf-8"?>
<book xmlns="http://docbook.org/ns/docbook" version="5.0">
<info>
<title>#{title}</title>
<author>#{author}</author>
<pubdate>#{pubdate}</pubdate>
<publisher>#{publisher}</publisher>
</info>
#{doc_toc}
#{doc_content}
</book>
  EOS
end
281
+
282
# Guesses whether +text+ is a volume heading.
#
# Returns false when the line contains a complete sentence, true when it
# starts with "第…卷", with "卷" followed by a number, or (ignoring case) with
# "volume" followed by a number; nil otherwise. +options+ is not used.
def guess_volume?(text, options={})
  if hav_complete_sentence?(text)
    false
  elsif text =~ /^第.{1,3}卷/ || text =~ /^卷\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
    true
  elsif text.downcase =~ /^volume\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
    true
  end
end
288
+
289
# Guesses whether +text+ is a part heading.
#
# Returns false when the line contains a complete sentence, true when it
# starts with "第…部" or "第…篇", or (ignoring case) with "part" followed by a
# number; nil otherwise. +options+ is not used.
def guess_part?(text, options={})
  if hav_complete_sentence?(text)
    false
  elsif text =~ /^第.{1,3}[部篇]/
    true
  elsif text.downcase =~ /^part\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
    true
  end
end
295
+
296
# Guesses whether +text+ is a chapter heading.
#
# Returns false when the line contains a complete sentence, true when it
# starts with "第…章" or "第…回", or (ignoring case) with "chapter" followed
# by a number; nil otherwise.
def guess_chapter?(text)
  if hav_complete_sentence?(text)
    false
  elsif text =~ /^第.{1,4}[章回]/
    true
  elsif text.downcase =~ /^chapter\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
    true
  end
end
302
+
303
# Guesses whether +text+ is a section heading.
#
# Returns false when the line contains a complete sentence, true when it
# starts with "第…节"; nil otherwise.
def guess_section?(text)
  if hav_complete_sentence?(text)
    false
  elsif text =~ /^第.{1,3}[节]/
    true
  end
end
307
+
308
# Guesses whether +text+ is a preface heading.
#
# Returns false when the line contains a complete sentence and true for
# * "前言" / "序言" (optionally with whitespace between the characters),
# * a single "序",
# * "序" or "序言" followed by a number (the regexp previously required
#   "言", so "序1" was not recognised although the rules above the module
#   list it),
# * "preface" / "foreword", alone or followed by a number (case ignored);
# nil otherwise.
def guess_preface?(text)
  return false if hav_complete_sentence?(text)

  native = [
    /^前\s*言$/,
    /^序\s*言$/,
    /^序$/,
    /^序言?\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
  ]
  english = [
    /^preface$/,
    /^preface\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/,
    /^foreword$/,
    /^foreword\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
  ]

  lowered = text.downcase
  true if native.any? { |re| text =~ re } || english.any? { |re| lowered =~ re }
end
320
+
321
# Guesses whether +text+ is an index heading.
#
# Returns false when the line contains a complete sentence, true for "索引"
# (optionally with whitespace between the characters) or "index", each alone
# or followed by a number (case ignored for "index"); nil otherwise.
def guess_index?(text)
  return false if hav_complete_sentence?(text)

  native = [/^索\s*引$/, /^索\s*引\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/]
  english = [/^index$/, /^index\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/]

  lowered = text.downcase
  true if native.any? { |re| text =~ re } || english.any? { |re| lowered =~ re }
end
329
+
330
# Guesses whether +text+ is an appendix heading.
#
# Returns false when the line contains a complete sentence, true for "附录"
# (optionally with whitespace between the characters) or "appendix", each
# alone or followed by a number or Latin letter (case ignored for
# "appendix"); nil otherwise.
def guess_appendix?(text)
  return false if hav_complete_sentence?(text)

  native = [/^附\s*录$/, /^附\s*录\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIiA-Za-z]/]
  english = [/^appendix$/, /^appendix\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIiA-Za-z]/]

  lowered = text.downcase
  true if native.any? { |re| text =~ re } || english.any? { |re| lowered =~ re }
end
338
+
339
# Guesses whether +text+ is a glossary heading.
#
# Returns false when the line contains a complete sentence, true for "术语"
# (optionally with whitespace between the characters) or "glossary", each
# alone or followed by a number (case ignored for "glossary"); nil otherwise.
def guess_glossary?(text)
  return false if hav_complete_sentence?(text)

  native = [/^术\s*语$/, /^术\s*语\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/]
  english = [/^glossary$/, /^glossary\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/]

  lowered = text.downcase
  true if native.any? { |re| text =~ re } || english.any? { |re| lowered =~ re }
end
347
+
348
# Guesses whether +text+ is a numbered section heading such as "1.1 定义" or
# "2.10.3 Title" and returns its level as a symbol.
#
# The line must start with a dotted number (at least two components, each of
# any number of digits), followed by whitespace and a non-empty title that
# contains no sentence punctuation. The level is the number of dots:
# "1.1" => :sect1, "1.1.1" => :sect2, and so on.
#
# Returns nil when the line does not have that shape and false when the title
# is empty or reads like a complete sentence.
#
# Components with several digits ("1.10") used to be rejected because only a
# single digit was accepted after each dot.
def guess_digital_section?(text)
  matcher = text.match(/^((?:\d+\.)+\d+)\s+(.*)/)
  return unless matcher

  number = matcher[1]
  title = matcher[2]
  return false if title.length == 0
  # Only the title is checked: the dots of the number are not punctuation.
  return false if hav_complete_sentence?(title)

  "sect#{number.count('.')}".to_sym
end
357
+
358
# Guesses whether +text+ is a numbered heading such as "1 管理的概念",
# "1.1 定义" or "3.12 Title" and returns its type as a symbol.
#
# The line must start with a number or dotted number (each component of any
# number of digits), followed by whitespace and a non-empty title that
# contains no sentence punctuation. A plain number yields :chapter; a dotted
# number yields :sectN where N is the number of dots.
#
# Returns nil when the line does not have that shape and false when the title
# is empty, reads like a complete sentence, or the leading number is greater
# than 99.
#
# Components with several digits ("1.10") used to be rejected because only a
# single digit was accepted after each dot.
def guess_digital_head_line?(text)
  matcher = text.match(/^(\d+(?:\.\d+)*)\s+(.*)/)
  return unless matcher

  title = matcher[2]
  return false if title.length == 0
  # Only the title is checked: the dots of the number are not punctuation.
  return false if hav_complete_sentence?(title)

  levels = matcher[1].split('.')
  return false if levels[0].to_i > 99

  levels.count == 1 ? :chapter : "sect#{levels.count - 1}".to_sym
end
373
+
374
# Guesses the heading type of +text+.
#
# The guess_* predicates are tried in a fixed order (volume, part, chapter,
# section, preface, appendix, index, glossary) and the symbol of the first
# one that matches is returned; nil when none matches.
def guess_head_line?(text)
  if guess_volume?(text) then :volume
  elsif guess_part?(text) then :part
  elsif guess_chapter?(text) then :chapter
  elsif guess_section?(text) then :section
  elsif guess_preface?(text) then :preface
  elsif guess_appendix?(text) then :appendix
  elsif guess_index?(text) then :index
  elsif guess_glossary?(text) then :glossary
  end
end
384
+
385
+
386
# Builds the nested book structure from marked content.
#
# +content+ is an array whose entries are either plain text lines or heading
# hashes {:title => ..., :type => ...} as produced by the mark_*_struct_info
# methods.
#
# An eight-slot stack holds the currently open node of each nesting level:
#   0 volume | 1 part | 2 chapter, appendix, index, glossary, preface,
#   afterword | 3 section/sect1 | 4 sect2 | 5 sect3 | 6 sect4 | 7 sect5
# A new heading closes every open node at its own level and below (attaching
# each to its nearest open ancestor, or to the result when there is none) and
# then becomes the open node of its level. Plain text is appended to the
# deepest open node, or to the result when no node is open.
#
# Special cases:
# * A level-3 heading inside a preface, appendix, index or glossary is stored
#   as plain text (its title) in that node instead of opening a section.
# * :section headings ("第…节") are stored with type 'sect1' so that they are
#   rendered; they used to be dropped.
# * Headings of an unsupported type (e.g. deeper than sect5) are kept as plain
#   text instead of being dropped.
#
# Returns the array of top-level nodes (hashes with :title, :type as a string
# and :children) and top-level text lines.
def build_struct(content)
  levels = {
    :volume => 0,
    :part => 1,
    :chapter => 2, :appendix => 2, :index => 2, :glossary => 2, :preface => 2, :afterword => 2,
    :section => 3, :sect1 => 3,
    :sect2 => 4,
    :sect3 => 5,
    :sect4 => 6,
    :sect5 => 7
  }
  flat_parents = ['preface', 'appendix', 'index', 'glossary']
  stack = Array.new(8)
  struct = []

  content.each do |line|
    type = line.is_a?(Hash) ? line[:type].to_sym : nil
    level = type && levels[type]

    if level.nil?
      # Plain text, or a heading type without a level of its own.
      text = line.is_a?(Hash) ? line[:title] : line
      owner = stack.compact.last
      (owner ? owner[:children] : struct) << text
    elsif level == 3 && stack[2] && flat_parents.include?(stack[2][:type])
      stack[2][:children] << line[:title]
    else
      # Close from the deepest level up to the new heading's level. A copy of
      # the stack prefix is passed; handing over the live stack (as the sect5
      # branch used to) shortened it and lost the open sect4 node.
      7.downto(level) do |index|
        closed_node(struct, stack[0..index])
        stack[index] = nil
      end
      node_type = (type == :section) ? 'sect1' : type.to_s
      stack[level] = {:title => line[:title], :type => node_type, :children => []}
    end
  end

  # Close whatever is still open at the end of the book.
  7.downto(0) do |index|
    closed_node(struct, stack[0..index])
    stack[index] = nil
  end

  struct
end

# Closes the node in the last slot of +stack+: the node is appended to the
# children of the nearest non-nil entry before it, or to +struct+ when there
# is no such entry. Does nothing when the last slot is empty. +stack+ itself
# is not modified.
def closed_node(struct, stack)
  node = stack.last
  return unless node

  parent = stack[0...-1].compact.last
  (parent ? parent[:children] : struct) << node
end
490
+
491
# Tells whether +text+ contains sentence punctuation (".", "。", "!", "?" or
# their full-width forms), which disqualifies a line as a heading.
#
# A leading section number followed by whitespace ("1 ", "1.1 ", "2.10.3 ")
# is removed first so that its dots are not mistaken for punctuation. Numbers
# with multi-digit components such as "1.10" are covered as well; previously
# only one digit was accepted after each dot.
#
# Returns the index of the first punctuation character (truthy) or nil.
def hav_complete_sentence?(text)
  text = text.gsub(/^\d+(\.\d+)*\s/, '')
  text =~ /[\.。!\?!?]/
end
495
+
496
# Derives the table of contents from a book structure.
#
# Every hash node of +struct+ becomes {:title, :type, :children}, where
# :children holds the table of contents of the node's own children; text
# entries are left out. Returns an array.
def extract_toc_from_struct(struct)
  headings = struct.select { |item| item.is_a?(Hash) }
  headings.map do |item|
    nested = item[:children].any? ? extract_toc_from_struct(item[:children]) : []
    {:title => item[:title], :type => item[:type], :children => nested}
  end
end
510
+
511
# Renders the table of contents (as returned by extract_toc_from_struct) as a
# DocBook <toc> element.
def gen_docbook_toc(toc)
  "<toc><title>Table of contents</title>#{gen_docbook_tocdiv(toc)}</toc>"
end

# Renders each entry of +toc+ as a <tocdiv> element, nesting the entries of
# its :children inside it. Titles are XML-escaped so that characters such as
# "&" or "<" cannot break the document. Returns the concatenated markup.
def gen_docbook_tocdiv(toc)
  toc.map do |item|
    nested = item[:children].any? ? gen_docbook_tocdiv(item[:children]) : ""
    "<tocdiv><title>#{CGI.escapeHTML(item[:title].to_s)}</title>#{nested}</tocdiv>"
  end.join("")
end
526
+
527
# Renders the book structure as DocBook body markup.
#
# Hash nodes become elements according to their :type:
# * 'volume' and 'part' render as <part>,
# * 'chapter', 'appendix', 'glossary', 'index', 'preface' and
#   'sect1'..'sect5' render as an element of the same name;
# nodes of any other type are left out. Each element carries a generated UUID
# and its children are rendered recursively. Text entries become <para>
# elements; entries that are empty after clean_text are skipped.
#
# Heading titles are XML-escaped like paragraph text, so characters such as
# "&" or "<" in a title no longer produce malformed XML.
#
# Returns the elements joined with newlines.
def gen_docbook_content(struct)
  content = []
  struct.each do |item|
    if item.is_a?(Hash)
      children = item[:children].any? ? gen_docbook_content(item[:children]) : ""
      title = escape_html(item[:title].to_s)
      case item[:type]
      when 'volume', 'part'
        content << "<part label='#{UUID.generate}'>\n<info><title>#{title}</title>\n</info>#{children}\n</part>"
      when 'chapter', 'appendix', 'glossary', 'index', 'preface',
           'sect1', 'sect2', 'sect3', 'sect4', 'sect5'
        content << "<#{item[:type]} id='#{UUID.generate}'>\n<info>\n<title>#{title}</title>\n</info>#{children}\n</#{item[:type]}>"
      end
    else
      text = escape_html(clean_text(item))
      content << "<para id='#{UUID.generate}'>#{text}</para>" if text.length > 0
    end
  end
  content.join("\n")
end
552
+
553
# Converts +text+ from +encoding+ (GB2312 unless given) to UTF-8 with Iconv.
# Both sides use the "//IGNORE" suffix. When the conversion raises, the
# original text is returned unchanged.
def to_utf8(text, encoding='GB2312')
  begin
    converted = Iconv.iconv('UTF-8//IGNORE', "#{encoding}//IGNORE", text)
    converted.join("")
  rescue
    text
  end
end
560
+
561
# Tells whether +content+ is a string whose bytes are valid in the encoding it
# is tagged with (UTF-8 when the file was read with a UTF-8 default external
# encoding — assumed, confirm against the runtime locale).
#
# The check asks the string directly instead of stripping every line and
# waiting for an exception, so unrelated errors are no longer swallowed and no
# throw-away copies are built. Returns false for objects that are not strings
# (including nil), as before.
def detect_utf8(content)
  content.respond_to?(:valid_encoding?) && content.valid_encoding?
end
567
+
568
# sanitize_for_epub_text
# Returns +content+ up to, but not including, the first line that contains
# "document outline" (case-insensitive). Content without such a line is
# returned in full.
def sanitize_for_epub_text(content)
  kept = content.each_line.take_while do |line|
    !line.downcase.include?('document outline')
  end
  kept.join("")
end
580
+
581
# clean_text
# Returns +text+ without surrounding whitespace and without any newline
# characters. nil is passed through unchanged.
def clean_text(text)
  if text.nil?
    text
  else
    text.strip.delete("\n")
  end
end
588
+
589
# escape_html
# Escapes +text+ with CGI.escapeHTML; used when text is written into markup.
def escape_html(text)
  CGI.escapeHTML(text)
end
594
+ end
metadata ADDED
@@ -0,0 +1,86 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: extract_book_struct
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.3
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Aaron
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-03-29 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: uuid
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: iconv
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ description: 书结构信息提取工具.
47
+ email: aaron@nonobo.com
48
+ executables:
49
+ - extract_book_struct
50
+ - batch_extract_book_struct
51
+ extensions: []
52
+ extra_rdoc_files: []
53
+ files:
54
+ - README
55
+ - CHANGELOG
56
+ - bin/extract_book_struct
57
+ - bin/batch_extract_book_struct
58
+ - lib/extract_book_struct.rb
59
+ - lib/batch_extract.rb
60
+ homepage:
61
+ licenses: []
62
+ post_install_message:
63
+ rdoc_options:
64
+ - --charset=UTF-8
65
+ require_paths:
66
+ - lib
67
+ required_ruby_version: !ruby/object:Gem::Requirement
68
+ none: false
69
+ requirements:
70
+ - - ! '>='
71
+ - !ruby/object:Gem::Version
72
+ version: '0'
73
+ required_rubygems_version: !ruby/object:Gem::Requirement
74
+ none: false
75
+ requirements:
76
+ - - ! '>='
77
+ - !ruby/object:Gem::Version
78
+ version: '0'
79
+ requirements:
80
+ - none
81
+ rubyforge_project:
82
+ rubygems_version: 1.8.25
83
+ signing_key:
84
+ specification_version: 3
85
+ summary: 书结构信息提取工具.
86
+ test_files: []