extract_book_struct 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +6 -0
- data/README +76 -0
- data/bin/batch_extract_book_struct +58 -0
- data/bin/extract_book_struct +95 -0
- data/lib/batch_extract.rb +76 -0
- data/lib/extract_book_struct.rb +594 -0
- metadata +86 -0
data/CHANGELOG
ADDED
data/README
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
# = ExtractBookStruct
|
3
|
+
# ExtractBookStruct的目的是从各类电子书内容中提取书的结构信息。目前支持txt,epub,html。
|
4
|
+
# ExtractBookStruct选择从TXT文档提取书的结构信息。对TXT的文档要有如下要求:
|
5
|
+
# 1. 文档的编码格式必须是UTF-8或GB2312,推荐使用UTF-8格式
|
6
|
+
# 2. 文档的内容只包含书内容部分(书名、作者、目录等信息应该不包含在文档内)
|
7
|
+
# 3. 文档的段落应该完整(有些PDF转换过来的文档会破坏句子,需要进行预处理)
|
8
|
+
# 4. 文档必须符合正常的文档流(错位的章节段落等情况将影响正常的结构提取)
|
9
|
+
# 5. 文档需要包含结构信息(例如: 卷、篇、部分、章(回)节或者有连续的序号)
|
10
|
+
# 6. 每个结构信息都应该独立成行。
|
11
|
+
#
|
12
|
+
# 文档结构信息分析
|
13
|
+
# 一本书在编排的时候会有自己的结构信息,这些结构信息通常通过卷、篇、部分、章(回)节等表述,也会使用序号的方式表述。总体上可以分为以下几种:
|
14
|
+
# 1. 文本描述(text): 按卷、部分(篇)、章(回)、节等文字表述
|
15
|
+
# 2. 数字描述(digital): 所有结构信息都是按照数字序号表示,比如 1 xxxxx; 1.1 xxxxx
|
16
|
+
# 3. 混合描述(hybrid):章按照文字表述,节按照序号表示,比如 1.1 xxxxxx
|
17
|
+
# 根据不同的类型,对结构信息的提取采用不同的处理手段。
|
18
|
+
#
|
19
|
+
# 有效的标题信息应该符合以下规则:
|
20
|
+
# 1. 标题应该不包含完整的句子(应该不包含句子分隔符,例如“。","!"等)
|
21
|
+
# 2. 应该包含结构信息表述,具体如下:
|
22
|
+
# 文本描述:
|
23
|
+
# 卷: 以"第xxx卷"开始
|
24
|
+
# 以"卷"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
25
|
+
# 以"volume"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
26
|
+
# 部分(篇): 以"第xxx部"或"第xxx篇"开始
|
27
|
+
# 以"part"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
28
|
+
# 章(回): 以"第xxx章"或"第xxx回"开始
|
29
|
+
# 以"chapter"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
30
|
+
# 节: 以"第xxx节"开始
|
31
|
+
# 前言: 以"前"开始,以"言"结束,中间加入空白字符。例如"前言","前 言"等。
|
32
|
+
# 以"序"开始,以"言"结束,中间加入空白字符。例如"序言","序 言"等。
|
33
|
+
# 单个"序"
|
34
|
+
# 以"序"或"序言"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
35
|
+
# "preface"
|
36
|
+
# "foreword"
|
37
|
+
# 以"preface"或"foreword"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
38
|
+
# 索引: 以"索"开始,以"引"结束,中间加入空白字符。例如"索引","索 引"等。
|
39
|
+
# 以"索引"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
40
|
+
# "index"
|
41
|
+
# 以"index"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
42
|
+
# 附录: 以"附"开始,以"录"结束,中间加入空白字符。例如"附录","附 录"等。
|
43
|
+
# 以"附录"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
44
|
+
# "appendix"
|
45
|
+
# 以"appendix"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
46
|
+
# 术语: 以"术"开始,以"语"结束,中间加入空白字符。例如"术语","术 语"等。
|
47
|
+
# 以"术语"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
48
|
+
# "glossary"
|
49
|
+
# 以"glossary"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
50
|
+
#
|
51
|
+
# 数字描述:
|
52
|
+
# 以数字序号层级表达,数字序号和标题内容之间有空白字符分隔。例如"1 管理的概念", "1.1 定义", "1.1.1 管理"等。
|
53
|
+
#
|
54
|
+
# ==API接口
|
55
|
+
#
|
56
|
+
# === ExtractBookStruct.from_txt
|
57
|
+
# 从文本文件中提取目录结构,使用示例:
|
58
|
+
# ExtractBookStruct.from_txt('1.txt',{:title=>'title',:author=>'author'})
|
59
|
+
#
|
60
|
+
# === ExtractBookStruct.from_epub
|
61
|
+
# 从EPUB文件中提取目录结构,使用示例:
|
62
|
+
# ExtractBookStruct.from_epub('1.epub',{:title=>'title',:author=>'author'})
|
63
|
+
#
|
64
|
+
# === ExtractBookStruct.from_html
|
65
|
+
# 从HTML中提取目录结构,使用示例:
|
66
|
+
# ExtractBookStruct.from_html('1.html',{:title=>'title',:author=>'author'})
|
67
|
+
#
|
68
|
+
# == 命令行工具
|
69
|
+
# extract_book_struct,使用示例:
|
70
|
+
# extract_book_struct '1.txt', '1.xml'
|
71
|
+
#
|
72
|
+
# == 依赖
|
73
|
+
# ExtractBookStruct依赖以下工具和包:
|
74
|
+
# ebook-convert: calibre cli tools.
|
75
|
+
# uuid: ruby gem.
|
76
|
+
# iconv: ruby gem.
|
@@ -0,0 +1,58 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
require 'rubygems'
|
4
|
+
require 'optparse'
|
5
|
+
require File.join(File.expand_path('../../',__FILE__),'lib','batch_extract')
|
6
|
+
|
7
|
+
def help
|
8
|
+
puts <<-EOF
|
9
|
+
extract_book_struct: 批量提取书结构信息
|
10
|
+
usage:
|
11
|
+
batch_extract_book_struct [options] source_dir destination_dir
|
12
|
+
|
13
|
+
source_dir: 指定需要提取结构信息的书所在目录
|
14
|
+
destination_dir: 指定提取的书结构信息所输出的文件目录
|
15
|
+
|
16
|
+
options:
|
17
|
+
-F,--format 指定要提取书的格式
|
18
|
+
|
19
|
+
适用对象要求:
|
20
|
+
1. 编码格式为utf-8
|
21
|
+
EOF
|
22
|
+
exit
|
23
|
+
end
|
24
|
+
|
25
|
+
# FileUtils is used below; require it explicitly instead of relying on
# another library having loaded it.
require 'fileutils'

# Switches collected here are handed to BatchExtract.batch_extract_from_dir.
options = {}
parser = OptionParser.new do |opts|

  opts.on('-F format','--format format','format') do |format|
    options[:format] = format
  end

  # `help` prints the usage text and terminates the process itself.
  opts.on('-h','--help') do
    help
  end
end

# OptionParser#parse returns the arguments that are left once every
# recognised switch (and its value) has been removed. Reading ARGV[-2] /
# ARGV[-1] directly would mistake an option value for a directory when a
# positional argument is missing or when switches follow the positionals.
positional = parser.parse(ARGV)
source_dir, destination_dir = positional.last(2)

# `help` exits, so nothing below runs when an argument is missing.
help if source_dir.nil? || destination_dir.nil?

unless File.directory?(source_dir)
  puts "error: source_dir #{source_dir} not is directory"
  exit
end

begin
  # mkdir_p is a no-op for an existing directory, so no existence check
  # (Dir.exists? was removed in Ruby 3.2) is needed.
  FileUtils.mkdir_p(destination_dir)
rescue
  puts "error: destination_dir #{destination_dir} not created"
  exit
end

BatchExtract.batch_extract_from_dir(source_dir,destination_dir,options)
|
@@ -0,0 +1,95 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
require 'rubygems'
|
4
|
+
require 'optparse'
|
5
|
+
require File.join(File.expand_path('../../',__FILE__),'lib','extract_book_struct')
|
6
|
+
|
7
|
+
def help
|
8
|
+
puts <<-EOF
|
9
|
+
extract_book_struct: 提取书结构信息
|
10
|
+
usage:
|
11
|
+
extract_book_struct [options] source_file docbook_file
|
12
|
+
|
13
|
+
source_file: 指定需要提取结构信息的书文件
|
14
|
+
docbook_file: 指定提取的书结构信息所输出的文件
|
15
|
+
|
16
|
+
options:
|
17
|
+
-T <title>, --title <title> : 书的标题
|
18
|
+
-A <author>, --author <author> : 书作者
|
19
|
+
--pubdate <pubdate> : 出版时间
|
20
|
+
--publisher <publisher> : 出版社
|
21
|
+
|
22
|
+
适用对象要求:
|
23
|
+
1. 编码格式为utf-8
|
24
|
+
EOF
|
25
|
+
exit
|
26
|
+
end
|
27
|
+
|
28
|
+
# FileUtils is used below; require it explicitly instead of relying on
# another library having loaded it.
require 'fileutils'

# Switches collected here are handed to ExtractBookStruct.from_*.
options = {}
parser = OptionParser.new do |opts|

  opts.on('-T title','--title title','title') do |title|
    options[:title] = title
  end

  opts.on('-A author','--author author','author') do |author|
    options[:author] = author
  end

  opts.on('--publisher publisher','publisher') do |publisher|
    options[:publisher] = publisher
  end

  opts.on('--pubdate pubdate','pubdate') do |pubdate|
    options[:pubdate] = pubdate
  end

  # `help` prints the usage text and terminates the process itself.
  opts.on('-h','--help') do
    help
  end
end

# OptionParser#parse returns the arguments that are left once every
# recognised switch (and its value) has been removed. Reading ARGV[-2] /
# ARGV[-1] directly would mistake an option value for a file name when a
# positional argument is missing or when switches follow the positionals.
positional = parser.parse(ARGV)
source_file, docbook_file = positional.last(2)

# `help` exits, so nothing below runs when an argument is missing.
help if source_file.nil? || docbook_file.nil?

# File.exists? / Dir.exists? were removed in Ruby 3.2; use the singular forms.
unless File.exist?(source_file)
  puts "error: source_file #{source_file} no found"
  exit
end

begin
  # mkdir_p is a no-op when the directory is already there.
  FileUtils.mkdir_p(File.dirname(docbook_file))
rescue
  puts "error: docbook_file #{docbook_file} not created"
  exit
end

ext_name = File.extname(source_file).downcase
# Default the book title to the source file name without its extension.
options[:title] ||= File.basename(source_file,ext_name)
unless ['.html','.txt','.epub'].include?(ext_name)
  puts "source_file不是允许的文件格式: txt,html,epub"
  exit
end

begin
  docbook_xml = case ext_name
                when '.html'
                  ExtractBookStruct.from_html(source_file,options)
                when '.txt'
                  ExtractBookStruct.from_txt(source_file,options)
                when '.epub'
                  ExtractBookStruct.from_epub(source_file,options)
                end
  # The extractors return nil when no structure was detected; write nothing then.
  if docbook_xml
    File.open(docbook_file,'wb'){|out| out.write docbook_xml}
    puts "success: extract book struct successfully!"
  end
rescue => e
  # Report the exception message as well; a bare backtrace does not say what failed.
  puts "error: #{source_file} #{e.message}\n#{e.backtrace.join("\n")}"
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'pathname'
|
3
|
+
require File.join(File.expand_path('../../',__FILE__),'lib','extract_book_struct')
|
4
|
+
|
5
|
+
module BatchExtract
|
6
|
+
extend self
|
7
|
+
# batch_extract_from_dir
|
8
|
+
# batch extract book struct from dir
|
9
|
+
# parameters:
|
10
|
+
# +source+ source directory
|
11
|
+
# +destination+ output directory
|
12
|
+
# +options+ optional parameter.
|
13
|
+
# :format 指定需要提取结构的文件后缀名,例如要从所有txt文件中提取,通过:format=>'.txt'指定
|
14
|
+
# Extracts the book structure of every file found under +source+ and writes
# one DocBook XML file per book under +destination+, mirroring the directory
# layout of +source+.
#
# parameters:
# +source+ directory that is scanned recursively
# +destination+ root directory for the generated "<basename>.xml" files
# +options+ optional settings; :format (a file extension such as '.txt')
#           restricts the scan and is removed before the remaining options
#           are passed on to ExtractBookStruct
#
# Returns the list of scanned file paths. Exceptions raised by an extractor
# or while writing a file are not rescued here and propagate to the caller.
def batch_extract_from_dir(source,destination,options={})
  # Work on a copy so the caller's hash keeps its :format entry.
  options = options.dup
  format = options.delete(:format)
  source_root = Pathname.new(source)
  destination_root = Pathname.new(destination)

  scan_file_from_dir(source,{:format=>format}).each do |file|
    extname = File.extname(file)
    basename = File.basename(file,extname)
    # Strip only the leading source directory. A plain gsub(source,'')
    # would also remove the same text occurring deeper inside the path.
    relative_dir = Pathname.new(file).dirname.relative_path_from(source_root)
    dest_file = destination_root + relative_dir + "#{basename}.xml"
    puts "start extract #{file} ..."

    # Compare the extension case-insensitively, as bin/extract_book_struct does.
    docbook_xml = case extname.downcase
                  when '.html' then ExtractBookStruct.from_html(file,options)
                  when '.txt'  then ExtractBookStruct.from_txt(file,options)
                  when '.epub' then ExtractBookStruct.from_epub(file,options)
                  end
    # Unsupported extension, or no structure detected: nothing to write.
    next unless docbook_xml

    # Books found in sub-directories need their output directory created first.
    dest_file.dirname.mkpath
    File.open(dest_file.to_s,'wb'){|out| out.write docbook_xml}
    puts "success: extract book struct successfully!"
  end
end
|
44
|
+
|
45
|
+
# scan_file_from_dir
|
46
|
+
# 遍历目录下的文件
|
47
|
+
# parameters:
|
48
|
+
# +dir+ 需遍历的目录
|
49
|
+
# +options+ 可选参数
|
50
|
+
# :format 指定需要遍历的文件后缀名,例如要遍历所有pdf文件,通过:format=>'.pdf'指定
|
51
|
+
def scan_file_from_dir(dir,options={})
|
52
|
+
files = []
|
53
|
+
walk_dir(dir,options) do |file|
|
54
|
+
files << file.to_s
|
55
|
+
end
|
56
|
+
files
|
57
|
+
end
|
58
|
+
|
59
|
+
# Recursively walks +path_str+ and yields every regular file as a Pathname.
#
# parameters:
# +path_str+ directory to walk (String or Pathname)
# +options+ optional settings; :format (for example '.txt') limits the
#           yielded files to those whose extension equals it exactly
#
# The options are forwarded to the recursive call so that the :format
# filter also applies to files inside sub-directories.
def walk_dir(path_str,options={},&block)
  format = options[:format]
  Pathname.new(path_str).children.each do |entry|
    if entry.directory?
      walk_dir(entry,options,&block)
    elsif entry.file?
      yield entry if !format || entry.extname == format
    end
  end
end
|
76
|
+
end
|
@@ -0,0 +1,594 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
# =ExtractBookStruct
|
3
|
+
# ExtractBookStruct的目的是提取书的结构信息。
|
4
|
+
# ExtractBookStruct选择从TXT文档提取书的结构信息。对TXT的文档要有如下要求:
|
5
|
+
# 1. 文档的编码格式必须是UTF-8或GB2312,推荐使用UTF-8格式
|
6
|
+
# 2. 文档的内容只包含书内容部分(书名、作者、目录等信息应该不包含在文档内)
|
7
|
+
# 3. 文档的段落应该完整(有些PDF转换过来的文档会破坏句子,需要进行预处理)
|
8
|
+
# 4. 文档必须符合正常的文档流(错位的章节段落等情况将影响正常的结构提取)
|
9
|
+
# 5. 文档需要包含结构信息(例如: 卷、篇、部分、章(回)节或者有连续的序号)
|
10
|
+
# 6. 每个结构信息都应该独立成行。
|
11
|
+
#
|
12
|
+
# 文档结构信息分析
|
13
|
+
# 一本书在编排的时候会有自己的结构信息,这些结构信息通常通过卷、篇、部分、章(回)节等表述,也会使用序号的方式表述。总体上可以分为以下几种:
|
14
|
+
# 1. 文本描述(text): 按卷、部分(篇)、章(回)、节等文字表述
|
15
|
+
# 2. 数字描述(digital): 所有结构信息都是按照数字序号表示,比如 1 xxxxx; 1.1 xxxxx
|
16
|
+
# 3. 混合描述(hybrid):章按照文字表述,节按照序号表示,比如 1.1 xxxxxx
|
17
|
+
# 根据不同的类型,对结构信息的提取采用不同的处理手段。
|
18
|
+
#
|
19
|
+
# 有效的标题信息应该符合以下规则:
|
20
|
+
# 1. 标题应该不包含完整的句子(应该不包含句子分隔符,例如“。","!"等)
|
21
|
+
# 2. 应该包含结构信息表述,具体如下:
|
22
|
+
# 文本描述:
|
23
|
+
# 卷: 以"第xxx卷"开始
|
24
|
+
# 以"卷"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
25
|
+
# 以"volume"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
26
|
+
# 部分(篇): 以"第xxx部"或"第xxx篇"开始
|
27
|
+
# 以"part"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
28
|
+
# 章(回): 以"第xxx章"或"第xxx回"开始
|
29
|
+
# 以"chapter"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
30
|
+
# 节: 以"第xxx节"开始
|
31
|
+
# 前言: 以"前"开始,以"言"结束,中间加入空白字符。例如"前言","前 言"等。
|
32
|
+
# 以"序"开始,以"言"结束,中间加入空白字符。例如"序言","序 言"等。
|
33
|
+
# 单个"序"
|
34
|
+
# 以"序"或"序言"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
35
|
+
# "preface"
|
36
|
+
# "foreword"
|
37
|
+
# 以"preface"或"foreword"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
38
|
+
# 索引: 以"索"开始,以"引"结束,中间加入空白字符。例如"索引","索 引"等。
|
39
|
+
# 以"索引"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
40
|
+
# "index"
|
41
|
+
# 以"index"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
42
|
+
# 附录: 以"附"开始,以"录"结束,中间加入空白字符。例如"附录","附 录"等。
|
43
|
+
# 以"附录"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
44
|
+
# "appendix"
|
45
|
+
# 以"appendix"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
46
|
+
# 术语: 以"术"开始,以"语"结束,中间加入空白字符。例如"术语","术 语"等。
|
47
|
+
# 以"术语"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
48
|
+
# "glossary"
|
49
|
+
# 以"glossary"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
50
|
+
#
|
51
|
+
# 数字描述:
|
52
|
+
# 以数字序号层级表达,数字序号和标题内容之间有空白字符分隔。例如"1 管理的概念", "1.1 定义", "1.1.1 管理"等。
|
53
|
+
#
|
54
|
+
# ==接口
|
55
|
+
#
|
56
|
+
# === ExtractBookStruct.from_txt
|
57
|
+
# 从文本文件中提取目录结构
|
58
|
+
#
|
59
|
+
# === ExtractBookStruct.from_epub
|
60
|
+
# 从EPUB文件中提取目录结构
|
61
|
+
#
|
62
|
+
# === ExtractBookStruct.from_html
|
63
|
+
# 从HTML中提取目录结构
|
64
|
+
|
65
|
+
require 'uuid'
|
66
|
+
require 'cgi'
|
67
|
+
require 'iconv'
|
68
|
+
|
69
|
+
module ExtractBookStruct
|
70
|
+
extend self
|
71
|
+
def from_txt(filename,options={})
|
72
|
+
content = File.open(filename).read
|
73
|
+
unless detect_utf8(content)
|
74
|
+
content = to_utf8(content)
|
75
|
+
end
|
76
|
+
content = sanitize_for_epub_text(content)
|
77
|
+
paras = extract_paras(content)
|
78
|
+
extract_book_struct(paras,options)
|
79
|
+
end
|
80
|
+
|
81
|
+
def from_html(filename,options={})
|
82
|
+
content = extract_text_from_file(filename,'.html')
|
83
|
+
content = to_utf8(content) unless detect_utf8(content)
|
84
|
+
paras = extract_paras(content)
|
85
|
+
extract_book_struct(paras,options)
|
86
|
+
end
|
87
|
+
|
88
|
+
def from_epub(filename,options={})
|
89
|
+
content = extract_text_from_file(filename,'.epub')
|
90
|
+
content = to_utf8(content) unless detect_utf8(content)
|
91
|
+
paras = extract_paras(content)
|
92
|
+
extract_book_struct(paras,options)
|
93
|
+
end
|
94
|
+
|
95
|
+
def extract_book_struct(paras,options={})
|
96
|
+
# 检查书类型(text,digital,hybrid)
|
97
|
+
format = options[:format] || detect_struct_type(paras)
|
98
|
+
case format
|
99
|
+
when :text
|
100
|
+
extract_text_book_struct(paras,options)
|
101
|
+
when :digital
|
102
|
+
extract_digital_book_struct(paras,options)
|
103
|
+
when :hybrid
|
104
|
+
extract_hybrid_book_struct(paras,options)
|
105
|
+
else
|
106
|
+
puts "警告: 没有检测到书结构信息."
|
107
|
+
return nil
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
# Converts +filename+ to plain text with calibre's `ebook-convert` and
# returns the text after it has been passed through sanitize_for_epub_text.
#
# parameters:
# +filename+ path of the book to convert
# +format+ extension of +filename+ (for example '.epub'); it is stripped
#          from the base name to build the temporary "<name>.txt" file,
#          which is created in the current working directory
#
# Raises the underlying system error when `ebook-convert` cannot be started
# or when it did not produce the text file.
def extract_text_from_file(filename,format)
  txt_path = "#{File.basename(filename,format)}.txt"
  begin
    # Argument-vector form: the command is executed without a shell, so
    # spaces or shell metacharacters in the file name cannot break or
    # alter the command line. The converter's stdout is read and discarded.
    IO.popen(['ebook-convert', filename, txt_path]) { |io| io.read }
    # File.read closes the handle; File.open(...).read left it open.
    content = File.read(txt_path)
  ensure
    # Remove the temporary file even when the conversion or the read failed.
    File.delete(txt_path) if File.exist?(txt_path)
  end
  sanitize_for_epub_text(content)
end
|
119
|
+
|
120
|
+
def extract_paras(content)
|
121
|
+
paras = []
|
122
|
+
content.each_line do |line|
|
123
|
+
text = clean_text(line)
|
124
|
+
paras << text if text.length > 0
|
125
|
+
end
|
126
|
+
paras
|
127
|
+
end
|
128
|
+
|
129
|
+
def detect_struct_type(paras)
|
130
|
+
text_flag = false
|
131
|
+
digital_flag = false
|
132
|
+
paras.each do |para|
|
133
|
+
if guess_volume?(para) || guess_part?(para) || guess_chapter?(para) || guess_section?(para)
|
134
|
+
text_flag = true
|
135
|
+
end
|
136
|
+
|
137
|
+
if guess_digital_head_line?(para)
|
138
|
+
digital_flag = true
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
142
|
+
if text_flag && digital_flag
|
143
|
+
:hybrid
|
144
|
+
elsif text_flag
|
145
|
+
:text
|
146
|
+
elsif digital_flag
|
147
|
+
:digital
|
148
|
+
else
|
149
|
+
:unknown
|
150
|
+
end
|
151
|
+
end
|
152
|
+
|
153
|
+
# 从text类型书中提取结构
|
154
|
+
def extract_text_book_struct(content,options={})
|
155
|
+
# 标注结构信息
|
156
|
+
marked_content = mark_struct_info(content)
|
157
|
+
|
158
|
+
# 构建书结构
|
159
|
+
struct = build_struct(marked_content)
|
160
|
+
|
161
|
+
# 修正结构
|
162
|
+
revised_struct = revise_struct(struct)
|
163
|
+
|
164
|
+
# 生成docbook
|
165
|
+
build_doc_book(revised_struct,options)
|
166
|
+
end
|
167
|
+
|
168
|
+
# 从数字类型书中提取结构
|
169
|
+
def extract_digital_book_struct(content,options={})
|
170
|
+
marked_content = mark_digital_struct_info(content)
|
171
|
+
|
172
|
+
# 构建书结构
|
173
|
+
struct = build_struct(marked_content)
|
174
|
+
|
175
|
+
# 修正结构
|
176
|
+
revised_struct = revise_struct(struct)
|
177
|
+
|
178
|
+
# 生成docbook
|
179
|
+
build_doc_book(revised_struct,options)
|
180
|
+
end
|
181
|
+
|
182
|
+
# 从混合类型书中提取结构
|
183
|
+
def extract_hybrid_book_struct(content,options={})
|
184
|
+
marked_content = mark_hybrid_struct_info(content)
|
185
|
+
|
186
|
+
# 构建书结构
|
187
|
+
struct = build_struct(marked_content)
|
188
|
+
|
189
|
+
# 修正结构
|
190
|
+
revised_struct = revise_struct(struct)
|
191
|
+
|
192
|
+
# 生成docbook
|
193
|
+
build_doc_book(revised_struct,options)
|
194
|
+
end
|
195
|
+
|
196
|
+
# 标注结构信息
|
197
|
+
# 将内容以行分割顺序存放在数组中,并对行猜测是否为结构信息,将猜测的结果以哈希的形式保存在数组中。
|
198
|
+
def mark_struct_info(content)
|
199
|
+
marked_content = []
|
200
|
+
content.each do |text|
|
201
|
+
if text.length > 0
|
202
|
+
type = guess_head_line?(text)
|
203
|
+
if type
|
204
|
+
marked_content << {:title=>text,:type=>type}
|
205
|
+
else
|
206
|
+
marked_content << text
|
207
|
+
end
|
208
|
+
end
|
209
|
+
end
|
210
|
+
marked_content
|
211
|
+
end
|
212
|
+
|
213
|
+
def mark_hybrid_struct_info(content)
|
214
|
+
marked_content = []
|
215
|
+
content.each do |text|
|
216
|
+
if text.length > 0
|
217
|
+
type = guess_head_line?(text)
|
218
|
+
if type
|
219
|
+
marked_content << {:title=>text,:type=>type}
|
220
|
+
else
|
221
|
+
type = guess_digital_section?(text)
|
222
|
+
if type
|
223
|
+
marked_content << {:title=>text,:type=>type}
|
224
|
+
else
|
225
|
+
marked_content << text
|
226
|
+
end
|
227
|
+
end
|
228
|
+
end
|
229
|
+
end
|
230
|
+
marked_content
|
231
|
+
end
|
232
|
+
|
233
|
+
def mark_digital_struct_info(content)
|
234
|
+
marked_content = []
|
235
|
+
content.each do |text|
|
236
|
+
if text.length > 0
|
237
|
+
type = guess_head_line?(text)
|
238
|
+
if type
|
239
|
+
marked_content << {:title=>text,:type=>type}
|
240
|
+
else
|
241
|
+
type = guess_digital_head_line?(text)
|
242
|
+
if type
|
243
|
+
marked_content << {:title=>text,:type=>type}
|
244
|
+
else
|
245
|
+
marked_content << text
|
246
|
+
end
|
247
|
+
end
|
248
|
+
end
|
249
|
+
end
|
250
|
+
marked_content
|
251
|
+
end
|
252
|
+
|
253
|
+
# 修正结构 TODO
|
254
|
+
def revise_struct(struct)
|
255
|
+
struct
|
256
|
+
end
|
257
|
+
|
258
|
+
def build_doc_book(struct,options={})
|
259
|
+
toc = extract_toc_from_struct(struct)
|
260
|
+
|
261
|
+
doc_toc = gen_docbook_toc(toc)
|
262
|
+
|
263
|
+
struct = struct.map{|item| item if item.is_a?(Hash)}.compact
|
264
|
+
|
265
|
+
doc_content = gen_docbook_content(struct)
|
266
|
+
|
267
|
+
<<-EOS
|
268
|
+
<?xml version="1.0" encoding="utf-8"?>
|
269
|
+
<book xmlns="http://docbook.org/ns/docbook" version="5.0">
|
270
|
+
<info>
|
271
|
+
<title>#{options[:title]}</title>
|
272
|
+
<author>#{options[:author]}</author>
|
273
|
+
<pubdate>#{options[:pubdate]}</pubdate>
|
274
|
+
<publisher>#{options[:publisher]}</publisher>
|
275
|
+
</info>
|
276
|
+
#{doc_toc}
|
277
|
+
#{doc_content}
|
278
|
+
</book>
|
279
|
+
EOS
|
280
|
+
end
|
281
|
+
|
282
|
+
def guess_volume?(text,options={})
|
283
|
+
return false if hav_complete_sentence?(text)
|
284
|
+
return true if (text =~ /^第.{1,3}卷/ || text =~ /^卷\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/)
|
285
|
+
text = text.downcase
|
286
|
+
return true if text =~ /^volume\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
|
287
|
+
end
|
288
|
+
|
289
|
+
def guess_part?(text,options={})
|
290
|
+
return false if hav_complete_sentence?(text)
|
291
|
+
return true if text =~ /^第.{1,3}[部篇]/
|
292
|
+
text = text.downcase
|
293
|
+
return true if text =~ /^part\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
|
294
|
+
end
|
295
|
+
|
296
|
+
def guess_chapter?(text)
|
297
|
+
return false if hav_complete_sentence?(text)
|
298
|
+
return true if text =~ /^第.{1,4}[章回]/
|
299
|
+
text = text.downcase
|
300
|
+
return true if text =~ /^chapter\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
|
301
|
+
end
|
302
|
+
|
303
|
+
def guess_section?(text)
|
304
|
+
return false if hav_complete_sentence?(text)
|
305
|
+
return true if text =~ /^第.{1,3}[节]/
|
306
|
+
end
|
307
|
+
|
308
|
+
def guess_preface?(text)
|
309
|
+
return false if hav_complete_sentence?(text)
|
310
|
+
return true if text =~ /^前\s*言$/
|
311
|
+
return true if text =~ /^序\s*言$/
|
312
|
+
return true if text =~ /^序$/
|
313
|
+
return true if text =~ /^序[言]\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
|
314
|
+
text = text.downcase
|
315
|
+
return true if text =~ /^preface$/
|
316
|
+
return true if text =~ /^preface\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
|
317
|
+
return true if text =~ /^foreword$/
|
318
|
+
return true if text =~ /^foreword\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
|
319
|
+
end
|
320
|
+
|
321
|
+
def guess_index?(text)
|
322
|
+
return false if hav_complete_sentence?(text)
|
323
|
+
return true if text =~ /^索\s*引$/
|
324
|
+
return true if text =~ /^索\s*引\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
|
325
|
+
text = text.downcase
|
326
|
+
return true if text =~ /^index$/
|
327
|
+
return true if text =~ /^index\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
|
328
|
+
end
|
329
|
+
|
330
|
+
def guess_appendix?(text)
|
331
|
+
return false if hav_complete_sentence?(text)
|
332
|
+
return true if text =~ /^附\s*录$/
|
333
|
+
return true if text =~ /^附\s*录\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIiA-Za-z]/
|
334
|
+
text = text.downcase
|
335
|
+
return true if text =~ /^appendix$/
|
336
|
+
return true if text =~ /^appendix\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIiA-Za-z]/
|
337
|
+
end
|
338
|
+
|
339
|
+
def guess_glossary?(text)
|
340
|
+
return false if hav_complete_sentence?(text)
|
341
|
+
return true if text =~ /^术\s*语$/
|
342
|
+
return true if text =~ /^术\s*语\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
|
343
|
+
text = text.downcase
|
344
|
+
return true if text =~ /^glossary$/
|
345
|
+
return true if text =~ /^glossary\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
|
346
|
+
end
|
347
|
+
|
348
|
+
def guess_digital_section?(text)
|
349
|
+
return false if hav_complete_sentence?(text)
|
350
|
+
matcher = text.match(/^(\d+\.)+[\d]\s*(.*)/)
|
351
|
+
if matcher
|
352
|
+
return false if matcher[2].length == 0
|
353
|
+
level = matcher[0].split(".").count - 1
|
354
|
+
"sect#{level}".to_sym
|
355
|
+
end
|
356
|
+
end
|
357
|
+
|
358
|
+
def guess_digital_head_line?(text)
|
359
|
+
return false if hav_complete_sentence?(text)
|
360
|
+
matcher = text.match(/(^\d+(\.\d)*\s)(.*)/)
|
361
|
+
if matcher
|
362
|
+
return false if matcher[3].length == 0
|
363
|
+
levels = matcher[1].split(".")
|
364
|
+
return false if levels[0].to_i > 99
|
365
|
+
case levels.count
|
366
|
+
when 1
|
367
|
+
"chapter".to_sym
|
368
|
+
else
|
369
|
+
"sect#{levels.count - 1}".to_sym
|
370
|
+
end
|
371
|
+
end
|
372
|
+
end
|
373
|
+
|
374
|
+
def guess_head_line?(text)
|
375
|
+
return :volume if guess_volume?(text)
|
376
|
+
return :part if guess_part?(text)
|
377
|
+
return :chapter if guess_chapter?(text)
|
378
|
+
return :section if guess_section?(text)
|
379
|
+
return :preface if guess_preface?(text)
|
380
|
+
return :appendix if guess_appendix?(text)
|
381
|
+
return :index if guess_index?(text)
|
382
|
+
return :glossary if guess_glossary?(text)
|
383
|
+
end
|
384
|
+
|
385
|
+
|
386
|
+
# Builds the nested book structure from the marked content.
#
# parameters:
# +content+ array whose items are either plain text lines (String) or
#           heading markers of the form {:title=>..., :type=>...}
#
# Returns an array of top-level items. A heading becomes a node
# {:title=>String, :type=>String, :children=>Array}; text lines are stored
# as plain strings in the :children of the deepest open node, or directly
# in the result when no heading has been seen yet.
#
# Nesting levels: volume(0) > part(1) > chapter/appendix/index/glossary/
# preface/afterword(2) > sect1(3) > sect2(4) > sect3(5) > sect4(6) > sect5(7).
# A :section heading is treated as sect1 and stored with type 'sect1', so
# that it is kept in the structure instead of being dropped. A heading of
# an unknown type is kept as an ordinary text line.
def build_struct(content)
  levels = {
    :volume=>0, :part=>1,
    :chapter=>2, :appendix=>2, :index=>2, :glossary=>2, :preface=>2, :afterword=>2,
    :section=>3, :sect1=>3, :sect2=>4, :sect3=>5, :sect4=>6, :sect5=>7
  }
  # First-level sections below these nodes are stored as plain text lines.
  flat_parents = ['preface','appendix','index','glossary']

  stack = Array.new(8) # stack[level] holds the currently open node of that level
  struct = []

  # Closes every open node from the deepest level up to +level+. Each node
  # is attached to its nearest open ancestor. closed_node always receives
  # a copy (stack[0..index]) so the live stack is never popped.
  close_from = lambda do |level|
    7.downto(level) do |index|
      closed_node(struct,stack[0..index])
      stack[index] = nil
    end
  end

  # Appends a text line to the deepest open node, or to the result root.
  append_text = lambda do |text|
    holder = stack.compact.last
    (holder ? holder[:children] : struct) << text
  end

  content.each do |line|
    unless line.is_a?(Hash)
      append_text.call(line)
      next
    end

    type = line[:type].to_sym
    level = levels[type]
    if level.nil?
      append_text.call(line[:title])
    elsif level == 3 && stack[2] && flat_parents.include?(stack[2][:type])
      stack[2][:children] << line[:title]
    else
      close_from.call(level)
      node_type = (type == :section) ? 'sect1' : type.to_s
      stack[level] = {:title=>line[:title],:type=>node_type,:children=>[]}
    end
  end

  close_from.call(0)
  struct
end

# Closes the last node of +stack+: the node is appended to the :children of
# the nearest preceding non-nil entry, or to +struct+ when there is none.
# Does nothing when the last entry is nil. The last entry is removed from
# the array that was passed in, so callers should hand over a copy.
def closed_node(struct,stack)
  node = stack.pop
  return unless node
  parent = stack.reverse.find { |item| item }
  (parent ? parent[:children] : struct) << node
end
|
490
|
+
|
491
|
+
# Tells whether +text+ looks like a sentence rather than a heading.
#
# A leading hierarchical number followed by whitespace ("1 ", "1.10 ",
# "12.3.45 ") is removed first so that the dots of the numbering are not
# mistaken for sentence punctuation. Every component of the number may have
# several digits. The remaining text counts as a sentence when it contains
# one of the terminators . 。 ! ? or their full-width forms.
#
# Returns the index of the first terminator (truthy), or nil when none is found.
def hav_complete_sentence?(text)
  text = text.gsub(/^\d+(\.\d+)*\s/,'')
  text =~ /[\.。!\?!?]/
end
|
495
|
+
|
496
|
+
def extract_toc_from_struct(struct)
|
497
|
+
toc = []
|
498
|
+
struct.each do |item|
|
499
|
+
if item.is_a?(Hash)
|
500
|
+
children = []
|
501
|
+
if item[:children].any?
|
502
|
+
children = extract_toc_from_struct(item[:children])
|
503
|
+
end
|
504
|
+
item_hash = {:title=>item[:title], :type=> item[:type],:children=>children}
|
505
|
+
toc << item_hash
|
506
|
+
end
|
507
|
+
end
|
508
|
+
toc
|
509
|
+
end
|
510
|
+
|
511
|
+
def gen_docbook_toc(toc)
|
512
|
+
"<toc><title>Table of contents</title>#{gen_docbook_tocdiv(toc)}</toc>"
|
513
|
+
end
|
514
|
+
|
515
|
+
def gen_docbook_tocdiv(toc)
|
516
|
+
doc_toc = []
|
517
|
+
toc.each do |item|
|
518
|
+
children = ""
|
519
|
+
if item[:children].any?
|
520
|
+
children = gen_docbook_tocdiv(item[:children])
|
521
|
+
end
|
522
|
+
doc_toc << "<tocdiv><title>#{item[:title]}</title>#{children}</tocdiv>"
|
523
|
+
end
|
524
|
+
doc_toc.join("")
|
525
|
+
end
|
526
|
+
|
527
|
+
def gen_docbook_content(struct)
|
528
|
+
content = []
|
529
|
+
struct.each do |item|
|
530
|
+
if item.is_a?(Hash)
|
531
|
+
children = ""
|
532
|
+
if item[:children].any?
|
533
|
+
children = gen_docbook_content(item[:children])
|
534
|
+
end
|
535
|
+
case item[:type]
|
536
|
+
when 'volume','part'
|
537
|
+
content << "<part label='#{UUID.generate}'>\n<info><title>#{item[:title]}</title>\n</info>#{children}\n</part>"
|
538
|
+
when 'chapter','appendix','glossary','index','preface'
|
539
|
+
content << "<#{item[:type]} id='#{UUID.generate}'>\n<info>\n<title>#{item[:title]}</title>\n</info>#{children}\n</#{item[:type]}>"
|
540
|
+
when 'sect1','sect2','sect3','sect4','sect5'
|
541
|
+
content << "<#{item[:type]} id='#{UUID.generate}'>\n<info>\n<title>#{item[:title]}</title>\n</info>#{children}\n</#{item[:type]}>"
|
542
|
+
end
|
543
|
+
else
|
544
|
+
text = escape_html(clean_text(item))
|
545
|
+
if text.length > 0
|
546
|
+
content << "<para id='#{UUID.generate}'>#{text}</para>"
|
547
|
+
end
|
548
|
+
end
|
549
|
+
end
|
550
|
+
content.join("\n")
|
551
|
+
end
|
552
|
+
|
553
|
+
def to_utf8(text,encoding='GB2312')
|
554
|
+
doc = Iconv.iconv('UTF-8//IGNORE',"#{encoding}//IGNORE",text)
|
555
|
+
doc.join("")
|
556
|
+
#text.encode(encoding)
|
557
|
+
rescue
|
558
|
+
text
|
559
|
+
end
|
560
|
+
|
561
|
+
# Tells whether the bytes of +content+ form valid UTF-8.
#
# The check is made on a copy that is tagged as UTF-8, so the result does
# not depend on the encoding label the string carried when it was read,
# and +content+ itself is left untouched.
#
# Returns true or false; false for nil or any object that is not string-like.
def detect_utf8(content)
  return false unless content.respond_to?(:force_encoding)
  content.dup.force_encoding('UTF-8').valid_encoding?
end
|
567
|
+
|
568
|
+
# sanitize_for_epub_text
|
569
|
+
def sanitize_for_epub_text(content)
|
570
|
+
lines = []
|
571
|
+
content.each_line do |line|
|
572
|
+
unless line.downcase.include?('document outline')
|
573
|
+
lines << line
|
574
|
+
else
|
575
|
+
break;
|
576
|
+
end
|
577
|
+
end
|
578
|
+
lines.join("")
|
579
|
+
end
|
580
|
+
|
581
|
+
# clean_text
|
582
|
+
# 获得干净的文本,去除两边的空格和回车
|
583
|
+
def clean_text(text)
|
584
|
+
return text if text.nil?
|
585
|
+
text = text.strip
|
586
|
+
text.gsub("\n",'')
|
587
|
+
end
|
588
|
+
|
589
|
+
# escape_html
|
590
|
+
# 文本转义,在txt文本转html时需要使用
|
591
|
+
def escape_html(text)
|
592
|
+
CGI::escapeHTML(text)
|
593
|
+
end
|
594
|
+
end
|
metadata
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: extract_book_struct
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.3
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Aaron
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-03-29 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: uuid
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: iconv
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
description: 书结构信息提取工具.
|
47
|
+
email: aaron@nonobo.com
|
48
|
+
executables:
|
49
|
+
- extract_book_struct
|
50
|
+
- batch_extract_book_struct
|
51
|
+
extensions: []
|
52
|
+
extra_rdoc_files: []
|
53
|
+
files:
|
54
|
+
- README
|
55
|
+
- CHANGELOG
|
56
|
+
- bin/extract_book_struct
|
57
|
+
- bin/batch_extract_book_struct
|
58
|
+
- lib/extract_book_struct.rb
|
59
|
+
- lib/batch_extract.rb
|
60
|
+
homepage:
|
61
|
+
licenses: []
|
62
|
+
post_install_message:
|
63
|
+
rdoc_options:
|
64
|
+
- --charset=UTF-8
|
65
|
+
require_paths:
|
66
|
+
- lib
|
67
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
68
|
+
none: false
|
69
|
+
requirements:
|
70
|
+
- - ! '>='
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: '0'
|
73
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
74
|
+
none: false
|
75
|
+
requirements:
|
76
|
+
- - ! '>='
|
77
|
+
- !ruby/object:Gem::Version
|
78
|
+
version: '0'
|
79
|
+
requirements:
|
80
|
+
- none
|
81
|
+
rubyforge_project:
|
82
|
+
rubygems_version: 1.8.25
|
83
|
+
signing_key:
|
84
|
+
specification_version: 3
|
85
|
+
summary: 书结构信息提取工具.
|
86
|
+
test_files: []
|