ebook_tools 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +2 -0
- data/README +76 -0
- data/bin/ebook_tools +196 -0
- data/ebook_tools.gemspec +38 -0
- data/lib/ebook_tools.rb +248 -0
- data/lib/epub.rb +104 -0
- data/lib/extract_book_struct.rb +415 -0
- data/lib/header_detect.rb +161 -0
- data/lib/pdf.rb +265 -0
- data/lib/txt.rb +108 -0
- data/lib/utils.rb +224 -0
- metadata +170 -0
@@ -0,0 +1,161 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# HeaderDetect
|
3
|
+
# HeaderDetect模块提供对标题的检测
|
4
|
+
#
|
5
|
+
# 文档结构信息分析
|
6
|
+
# 一本书在编排的时候会有自己的结构信息,这些结构信息通常通过卷、篇、部分、章(回)节等表述,也会使用序号的方式表述。总体上可以分为以下几种:
|
7
|
+
# 1. 文本描述(text): 按卷、部分(篇)、章(回)、节等文字表述
|
8
|
+
# 2. 数字描述(digital): 所有结构信息都是按照数字序号表示,比如 1 xxxxx; 1.1 xxxxx
|
9
|
+
# 3. 混合描述(hybrid):章按照文字表述,节按照序号表示,比如 1.1 xxxxxx
|
10
|
+
# 根据不同的类型,对结构信息的提取采用不同的处理手段。
|
11
|
+
#
|
12
|
+
# 有效的标题信息应该符合以下规则:
|
13
|
+
# 1. 标题应该不包含完整的句子(应该不包含句子分隔符,例如“。","!"等)
|
14
|
+
# 2. 应该包含结构信息表述,具体如下:
|
15
|
+
# 文本描述:
|
16
|
+
# 卷: 以"第xxx卷"开始
|
17
|
+
# 以"卷"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
18
|
+
# 以"volume"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
19
|
+
# 部分(篇): 以"第xxx部"或"第xxx篇"开始
|
20
|
+
# 以"part"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
21
|
+
# 章(回): 以"第xxx章"或"第xxx回"开始
|
22
|
+
# 以"chapter"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
23
|
+
# 节: 以"第xxx节"开始
|
24
|
+
# 前言: 以"前"开始,以"言"结束,中间加入空白字符。例如"前言","前 言"等。
|
25
|
+
# 以"序"开始,以"言"结束,中间加入空白字符。例如"序言","序 言"等。
|
26
|
+
# 单个"序"
|
27
|
+
# 以"序"或"序言"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
28
|
+
# "preface"
|
29
|
+
# "foreword"
|
30
|
+
# 以"preface"或"foreword"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
31
|
+
# 索引: 以"索"开始,以"引"结束,中间加入空白字符。例如"索引","索 引"等。
|
32
|
+
# 以"索引"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
33
|
+
# "index"
|
34
|
+
# 以"index"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
35
|
+
# 附录: 以"附"开始,以"录"结束,中间加入空白字符。例如"附录","附 录"等。
|
36
|
+
# 以"附录"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
37
|
+
# "appendix"
|
38
|
+
# 以"appendix"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
39
|
+
# 术语: 以"术"开始,以"语"结束,中间加入空白字符。例如"术语","术 语"等。
|
40
|
+
# 以"术语"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
41
|
+
# "glossary"
|
42
|
+
# 以"glossary"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
|
43
|
+
#
|
44
|
+
# 数字描述:
|
45
|
+
# 以数字序号层级表达,数字序号和标题内容之间有空白字符分隔。例如"1 管理的概念", "1.1 定义", "1.1.1 管理"等。
|
46
|
+
# Heuristic detection of structural headings (volume, part, chapter, section,
# preface, appendix, index, glossary and numbered sections) in a line of text.
#
# A line qualifies as a heading only when it contains no sentence punctuation
# and starts with one of the recognised structure markers.
module HeaderDetect
  extend self

  # Heading kinds, in the order in which guess_header? tries them.
  HEAD_TYPES = [:volume, :part, :chapter, :section, :preface, :appendix, :index, :glossary].freeze

  # First character of an ordinal: ASCII digit, Roman-numeral code point, or I/i.
  ORDINAL = '[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]'.freeze
  # Appendices are also commonly labelled with plain letters ("Appendix A").
  ORDINAL_OR_LETTER = '[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIiA-Za-z]'.freeze

  # Leading section number such as "1 ", "1.2 " or "1.10.3 ". Each level may
  # have several digits; the previous single-digit form rejected "1.10 ...".
  NUMBER_PREFIX = /^\d+(\.\d+)*\s/
  # Any of these characters marks the text as a sentence rather than a title.
  SENTENCE_MARK = /[\.。!\?!?]/

  # Per heading kind: :native patterns are tried against the text as given,
  # :latin patterns against the downcased text.
  HEADER_PATTERNS = {
    :volume   => { :native => [/^第.{1,3}卷/, /^卷\s*#{ORDINAL}/],
                   :latin  => [/^volume\s*#{ORDINAL}/] },
    :part     => { :native => [/^第.{1,3}[部篇]/],
                   :latin  => [/^part\s*#{ORDINAL}/] },
    :chapter  => { :native => [/^第.{1,4}[章回]/],
                   :latin  => [/^chapter\s*#{ORDINAL}/] },
    :section  => { :native => [/^第.{1,3}[节]/],
                   :latin  => [] },
    # "序言?" makes 言 optional so that both "序1" and "序言1" are accepted.
    :preface  => { :native => [/^前\s*言$/, /^序\s*言$/, /^序$/, /^序言?\s*#{ORDINAL}/],
                   :latin  => [/^preface$/, /^preface\s*#{ORDINAL}/,
                               /^foreword$/, /^foreword\s*#{ORDINAL}/] },
    :index    => { :native => [/^索\s*引$/, /^索\s*引\s*#{ORDINAL}/],
                   :latin  => [/^index$/, /^index\s*#{ORDINAL}/] },
    :appendix => { :native => [/^附\s*录$/, /^附\s*录\s*#{ORDINAL_OR_LETTER}/],
                   :latin  => [/^appendix$/, /^appendix\s*#{ORDINAL_OR_LETTER}/] },
    :glossary => { :native => [/^术\s*语$/, /^术\s*语\s*#{ORDINAL}/],
                   :latin  => [/^glossary$/, /^glossary\s*#{ORDINAL}/] }
  }.freeze

  # Returns true when +text+ still contains sentence punctuation after a
  # leading section number (whose dots are not punctuation) has been removed.
  def hav_complete_sentence?(text)
    !!(text.gsub(NUMBER_PREFIX, '') =~ SENTENCE_MARK)
  end

  # True for "第x卷", "卷 <ordinal>" or "volume <ordinal>".
  # +options+ is accepted for compatibility and is not used.
  def guess_volume?(text, options={})
    header_of_type?(:volume, text)
  end

  # True for "第x部", "第x篇" or "part <ordinal>".
  # +options+ is accepted for compatibility and is not used.
  def guess_part?(text, options={})
    header_of_type?(:part, text)
  end

  # True for "第x章", "第x回" or "chapter <ordinal>".
  def guess_chapter?(text)
    header_of_type?(:chapter, text)
  end

  # True for "第x节".
  def guess_section?(text)
    header_of_type?(:section, text)
  end

  # True for "前言", "序言", "序", "preface", "foreword" and their numbered forms.
  def guess_preface?(text)
    header_of_type?(:preface, text)
  end

  # True for "索引", "index" and their numbered forms.
  def guess_index?(text)
    header_of_type?(:index, text)
  end

  # True for "附录", "appendix" and their numbered or lettered forms.
  def guess_appendix?(text)
    header_of_type?(:appendix, text)
  end

  # True for "术语", "glossary" and their numbered forms.
  def guess_glossary?(text)
    header_of_type?(:glossary, text)
  end

  # Recognises dotted section numbers with at least two levels followed by a
  # title, e.g. "1.1 定义" or "1.10.2 管理".
  # Returns :sectN, where N is the number of dots, or false.
  def guess_digital_section?(text)
    return false if hav_complete_sentence?(text)
    matcher = text.match(/^(\d+(?:\.\d+)+)\s*(.*)/)
    return false if matcher.nil? || matcher[2].empty?
    "sect#{matcher[1].count('.')}".to_sym
  end

  # Recognises "N title" as :chapter and "N.M... title" as :sectK, where K is
  # the number of dots. Leading numbers above 99 are rejected (presumably
  # because they are unlikely to be chapter numbers). Returns false otherwise.
  def guess_digital_header?(text)
    return false if hav_complete_sentence?(text)
    matcher = text.match(/^(\d+(?:\.\d+)*)\s(.*)/)
    return false if matcher.nil? || matcher[2].empty?
    levels = matcher[1].split('.')
    return false if levels.first.to_i > 99
    levels.size == 1 ? :chapter : "sect#{levels.size - 1}".to_sym
  end

  # Returns the heading kind of +text+ (one of HEAD_TYPES), :section for a
  # dotted numeric heading, or nil when the text does not look like a heading.
  def guess_header?(text)
    type = HEAD_TYPES.find { |candidate| send("guess_#{candidate}?", text) }
    return type if type
    guess_digital_section?(text) ? :section : nil
  end

  private

  # Shared matcher behind the guess_<type>? predicates.
  def header_of_type?(type, text)
    return false if hav_complete_sentence?(text)
    patterns = HEADER_PATTERNS[type]
    return true if patterns[:native].any? { |pattern| text =~ pattern }
    lowered = text.downcase
    patterns[:latin].any? { |pattern| lowered =~ pattern }
  end
end
|
data/lib/pdf.rb
ADDED
@@ -0,0 +1,265 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'poppler'
|
3
|
+
require 'pdf-reader'
|
4
|
+
|
5
|
+
# PDF helpers: scanned-PDF detection, per-page text extraction, header/footer
# stripping, outline and illustration extraction, and page-to-HTML rendering.
module PDF
  extend self
  include Utils

  # A PDF whose total extracted text is shorter than this is treated as scanned.
  SCAN_TEXT_THRESHOLD = 1000
  # Header/footer guesses above this many rows are discarded (reported as 0).
  MAX_MARGIN_ROWS = 2

  # scan_pdf?
  # Checks whether the given file is a scanned (image-only) PDF.
  # parameters:
  #   +filename+ path of the PDF file
  # Returns true/false for ".pdf" files and nil for any other extension.
  def scan_pdf?(filename)
    return nil unless File.extname(filename).downcase == '.pdf'
    pdf = Poppler::Document.new(filename)
    content = pdf.map { |page| page.get_text }.join('')
    content.strip.length < SCAN_TEXT_THRESHOLD
  end

  # extract_pdf_pages_text
  # Extracts the text of every page.
  # parameters:
  #   +filename+ path of the PDF file
  # Returns an array with one string per page.
  def extract_pdf_pages_text(filename)
    PDF::Reader.new(filename).pages.map { |page| page.text }
  end

  # sanitize_page_header_and_footer
  # Removes page header and footer rows from every page.
  # parameters:
  #   +pdf_pages_text+ array of page texts
  #   +options+ optional settings
  #      :header_rows_count number of header rows (guessed when omitted)
  #      :footer_rows_count number of footer rows (guessed when omitted)
  def sanitize_page_header_and_footer(pdf_pages_text, options={})
    header_rows_count = options[:header_rows_count] || guess_header_row_count(pdf_pages_text)
    footer_rows_count = options[:footer_rows_count] || guess_footer_row_count(pdf_pages_text)
    pdf_pages_text.map do |page_text|
      # The slice is nil when the page has fewer rows than the header count.
      kept = page_text.split("\n")[header_rows_count..(-footer_rows_count - 1)] || []
      kept.join("\n")
    end
  end

  # extract_pdf_meta
  # Extracts PDF metadata.
  # parameters:
  #   +filename+ path of the PDF file
  # Returns a hash with :author and :title.
  def extract_pdf_meta(filename)
    pdf = Poppler::Document.new(filename)
    { :author => pdf.author, :title => pdf.title }
  end

  # extract_sections
  # Extracts the outline (bookmarks) of the PDF.
  # parameters:
  #   +filename+ path of the PDF file
  # Best effort: on any StandardError the sections collected so far are returned.
  def extract_sections(filename)
    sections = []
    pdf = Poppler::Document.new(filename)
    indexer = Poppler::IndexIter.new(pdf)
    walk_index(indexer, sections)
    sections
  rescue
    sections
  end

  # extract_illustrations
  # Extracts the images embedded in the PDF and converts them to PNG.
  # parameters:
  #   +filename+ path of the PDF file
  #   +options+ optional settings
  #      :dir  output directory; defaults to a sub-directory of the current
  #            directory named after the file
  # Returns the PNG file names relative to the output directory.
  def extract_illustrations(filename, options={})
    tmp_dir = options[:dir] || File.basename(filename, '.pdf')
    Dir.mkdir(tmp_dir) unless Dir.exist?(tmp_dir)
    # Argument-list form bypasses the shell, so quotes or other metacharacters
    # in the file name can neither break the command nor inject another one.
    system('pdfimages', '-p', filename, "#{tmp_dir}/")
    # The glob is handed to mogrify unexpanded, as the quoted shell form did.
    system('mogrify', '-format', 'png', "#{tmp_dir}/*.ppm")
    # Block form restores the working directory even when an error is raised.
    Dir.chdir(tmp_dir) { Dir.glob('*.png') }
  end

  # Repairs broken lines inside each page and across page boundaries.
  # Returns +pages_text+ unchanged when no line length can be guessed.
  def fixed_break_with_pages_text(pages_text)
    longest = pages_text.map { |text| Utils.guess_content_line_length(text) }.compact.max
    return pages_text if longest.nil?
    line_length = longest * 0.5
    pages = pages_text.map { |page_text| Utils.fixed_page_break(page_text, :length => line_length) }
    fixed_break_of_cross_page(pages, line_length)
  end

  # Renders the book as HTML. The page numbers in +sections+ are not reliable,
  # so the outline is ignored for now and only the page texts are rendered.
  def gen_html_from_sections_and_page_texts(sections, page_texts, illustrations)
    gen_html_from_page_texts(page_texts, illustrations)
  end

  # Selects the illustrations whose file name carries page number +index+,
  # i.e. whose second "-"-separated token equals +index+.
  def extract_page_illustrations(illustrations, index)
    illustrations.select { |image_path| image_path.split("-")[1].to_i == index }
  end

  # Renders every page as HTML and concatenates the results.
  def gen_html_from_page_texts(page_texts, illustrations, options={})
    page_htmls = page_texts.each_with_index.map do |page_text, index|
      # pdfimages -p numbers pages from 1, while +index+ is 0-based.
      page_illustrations = extract_page_illustrations(illustrations, index + 1)
      gen_html_from_page_text(page_text, page_illustrations, options.merge(:index => index))
    end
    page_htmls.join("")
  end

  # Renders one page: detected headings become <h2>, other non-blank lines
  # become <p>, and the page's illustrations are appended at the end.
  def gen_html_from_page_text(page_text, illustrations, options={})
    fragments = page_text.split("\n").each_with_index.map do |line, index|
      next unless line.present?
      body = Utils.escape_html(Utils.clean_text(line))
      if HeaderDetect.guess_header?(line)
        "<h2 id='#{options[:index]}_#{index}'>#{body}</h2>"
      else
        "<p class='division'>#{body}</p>"
      end
    end
    html = fragments.compact.join("")
    images = illustrations.map { |image_path| "<p class='division'><img src='#{image_path}' /></p>" }.compact.join("")
    "<div class='page' name='#{options[:index]}' >#{html}#{images}</div>"
  end

  # Recursively copies an outline iterator into +sections+ as hashes with
  # :title, :page_num and, for entries with children, :sub_sections.
  def walk_index(indexer, sections)
    indexer.each_with_index do |entry, index|
      sections[index] = { :title => Utils.clean_text(entry.action.title), :page_num => entry.action.dest.page_num }
      child = entry.child
      next if child.nil?
      sub_sections = []
      walk_index(child, sub_sections)
      sections[index][:sub_sections] = sub_sections
    end
  end

  # Kept for backward compatibility; behaves exactly like walk_index.
  def work_index(child, sections)
    walk_index(child, sections)
  end

  # Joins a paragraph that was split across two consecutive pages: when the
  # last line of a page has no end mark, the first line of the next page is
  # merged into it. Modifies and returns +pages+. +length+ is currently unused.
  def fixed_break_of_cross_page(pages, length=80)
    (0...(pages.count - 1)).each do |i|
      first_page_lines = pages[i].split("\n")
      second_page_lines = pages[i + 1].split("\n")
      next if first_page_lines.empty? || second_page_lines.empty?
      next if Utils.end_mark?(first_page_lines.last)

      first_page_lines[-1] = Utils.merge_para_part(first_page_lines.last, second_page_lines.shift)
      pages[i] = first_page_lines.join("\n")
      pages[i + 1] = second_page_lines.join("\n")
    end
    pages
  end

  # Guesses the number of header rows.
  # Headers and footers follow some regularities:
  # 1. They appear at a fixed position on every page, or mirrored on facing pages.
  # 2. They usually show the book title, chapter title, page number, etc.
  # 3. They repeat either on every page or on every other page.
  # Returns 0 when more than MAX_MARGIN_ROWS rows look like headers.
  def guess_header_row_count(pages_text)
    count_margin_rows(pages_text) { |lines, offset| lines[offset] }
  end

  # Guesses the number of footer rows, counting from the bottom of each page.
  # Returns 0 when more than MAX_MARGIN_ROWS rows look like footers.
  def guess_footer_row_count(pages_text)
    count_margin_rows(pages_text) { |lines, offset| lines[-offset - 1] }
  end

  # Guesses whether +lines+ (the same row taken from every page) is a header
  # or footer row.
  # Rules (each needs more than 50% of the rows to qualify):
  # 1. rows on adjacent pages are more than 70% similar;
  # 2. rows two pages apart are more than 70% similar;
  # 3. the row parses as a positive integer (page number).
  def guess_header_line?(lines)
    return false if lines.empty?

    lines = lines.map { |line| line.strip if line.present? }
    total = lines.count.to_f

    [1, 2].each do |step|
      return true if similar_row_indexes(lines, step).count / total > 0.5
    end

    lines.count { |line| line.to_i > 0 } / total > 0.5
  end
  alias guess_footer_line? guess_header_line?

  private

  # Counts consecutive margin rows. The block picks the row at +offset+ from
  # the lines of one page. Stops once the count exceeds MAX_MARGIN_ROWS, since
  # such a guess is discarded anyway; this also bounds the loop.
  def count_margin_rows(pages_text)
    pages_lines = pages_text.map { |page_text| page_text.split("\n") }
    rows = 0
    while rows <= MAX_MARGIN_ROWS
      candidates = pages_lines.map { |lines| yield(lines, rows) }
      break unless guess_header_line?(candidates)
      rows += 1
    end
    rows > MAX_MARGIN_ROWS ? 0 : rows
  end

  # Indexes of rows that are more than 70% similar to the row +step+ pages later.
  def similar_row_indexes(lines, step)
    pairs = lines.each_index.select do |index|
      Utils.text_similarity(lines[index], lines[index + step]) > 0.7
    end
    pairs.flat_map { |index| [index, index + step] }.uniq
  end
end
|
data/lib/txt.rb
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
# Plain-text book helpers: splits a .txt book into title, outline and body,
# and renders those parts as HTML.
module TXT
  extend self

  # Reads +filename+ and returns [title, outlines, content].
  # The file name without ".txt" is the fallback title.
  def extract_book_part(filename)
    # File.read closes the handle; File.open(...).read left it open.
    content = clean_forward_blank(File.read(filename))

    title, content = extract_title_and_content(content, :title => File.basename(filename, '.txt'))
    outlines, content = extract_outlines_and_content(content)

    [title, outlines, content]
  end

  # Removes carriage returns and leading blank lines from +content+.
  # When removing carriage returns raises (e.g. invalid byte sequence), the
  # content is passed through Utils.to_utf8 first; "" is returned when that
  # yields nil.
  def clean_forward_blank(content)
    begin
      content = content.gsub("\r", "")
    rescue
      content = Utils.to_utf8(content)
      return '' if content.nil?
      content = content.gsub("\r", "")
    end

    content.split(/\n/).drop_while { |line| !line.present? }.join("\n")
  end

  # Uses the first line as the title when it is more than 80% similar to
  # options[:title]; otherwise the first line stays in the content.
  # Returns [title, content].
  def extract_title_and_content(content, options={})
    title = options[:title] || ''
    lines = content.split("\n")
    first_line = lines.shift
    if Utils.text_similarity(first_line, title) > 0.8
      title = first_line
    else
      lines.unshift(first_line)
    end
    [title, lines.join("\n")]
  end

  # Treats a leading run of two or more heading lines as the outline (table
  # of contents). Returns [outlines, content]; outlines is nil when the run
  # is shorter than two lines, in which case those lines stay in the content.
  def extract_outlines_and_content(content)
    lines = content.split(/\n/)
    outlines = lines.take_while { |line| HeaderDetect.guess_header?(line) }
    rest = lines.drop(outlines.size)
    # As before, a blank line directly after the heading run is discarded.
    rest.shift unless rest.empty? || rest.first.present?

    if outlines.count > 1
      [outlines.join("\n"), rest.join("\n")]
    else
      [nil, (outlines + rest).join("\n")]
    end
  end

  # Renders the whole book: <h1> title, outline list, then the body.
  # The title is HTML-escaped, like the body lines.
  def gen_html_from_txt_book(title, outlines, content, options={})
    html = "<h1>#{Utils.escape_html(title.to_s)}</h1>"
    html = html + gen_html_from_txt_outlines(outlines, options)
    html + gen_html_from_txt_content(content, options)
  end

  # Renders the newline-separated +outlines+ as an ordered list, or returns
  # '' when there is no outline.
  def gen_html_from_txt_outlines(outlines, options={})
    return '' unless outlines.present?

    # Join the items: interpolating the array itself would emit its inspect
    # form (brackets, quotes and commas) into the markup.
    items = outlines.split("\n").map { |item| "<li>#{Utils.escape_html(item)}</li>" }.join("")
    "<ol class='outlines'>#{items}</ol>"
  end

  # Renders the body: detected headings become <h2 id='row'> (1-based row
  # number, blank rows included in the count), other non-blank lines become
  # <p class='division'>.
  def gen_html_from_txt_content(content, options={})
    fragments = content.split("\n").each_with_index.map do |line, index|
      next unless line.present?
      body = Utils.escape_html(Utils.clean_text(line))
      if HeaderDetect.guess_header?(line)
        "<h2 id='#{index + 1}'>#{body}</h2>"
      else
        "<p class='division'>#{body}</p>"
      end
    end
    fragments.compact.join("")
  end
end
|