ebook_tools 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,161 @@
1
+ # encoding: utf-8
2
# HeaderDetect
# The HeaderDetect module detects heading lines in book text.
#
# Document structure analysis
# A book is laid out with its own structure, usually expressed through
# volumes, parts, chapters and sections, or through plain numbering.
# Broadly there are three styles:
#   1. Text (text): keywords such as volume, part, chapter, section
#   2. Numeric (digital): everything is numbered, e.g. "1 xxxxx"; "1.1 xxxxx"
#   3. Hybrid (hybrid): chapters use keywords, sections use numbers, e.g. "1.1 xxxxxx"
# Each style is recognised by different rules.
#
# A valid heading obeys the following rules:
#   1. It does not contain a complete sentence (no sentence terminators
#      such as "。" or "!").
#   2. It contains a structural marker:
#      Text style:
#        Volume:   starts with "第xxx卷"
#                  starts with "卷" followed by an ordinal ("I", "Ⅱ", "1", ...)
#                  starts with "volume" followed by an ordinal
#        Part:     starts with "第xxx部" or "第xxx篇"
#                  starts with "part" followed by an ordinal
#        Chapter:  starts with "第xxx章" or "第xxx回"
#                  starts with "chapter" followed by an ordinal
#        Section:  starts with "第xxx节"
#        Preface:  "前言" / "序言", optionally with whitespace between the two characters
#                  a lone "序"
#                  "序" or "序言" followed by an ordinal
#                  "preface" / "foreword", alone or followed by an ordinal
#        Index:    "索引" (optionally with inner whitespace), alone or followed by an ordinal
#                  "index", alone or followed by an ordinal
#        Appendix: "附录" (optionally with inner whitespace), alone or followed by an ordinal
#                  "appendix", alone or followed by an ordinal or a single letter
#        Glossary: "术语" (optionally with inner whitespace), alone or followed by an ordinal
#                  "glossary", alone or followed by an ordinal
#
#      Numeric style:
#        A dotted number hierarchy separated from the title by whitespace,
#        e.g. "1 管理的概念", "1.1 定义", "1.1.1 管理".
module HeaderDetect
  extend self

  HEAD_TYPES = [:volume,:part,:chapter,:section,:preface,:appendix,:index,:glossary].freeze

  # First character of an ordinal that may follow a Chinese keyword.
  ORDINAL_START = '[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]'.freeze
  # Same, but appendices may also be labelled with a Latin letter ("附录A").
  APPENDIX_ORDINAL_START = '[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIiA-Za-z]'.freeze

  # Ordinal following an English keyword: a digit, a Unicode Roman numeral,
  # or a run of ASCII Roman-numeral letters that is not the beginning of a
  # longer word. The negative lookahead is what keeps "Particle" or
  # "Indexing" from being read as "Part I..." / "Index I...".
  ENGLISH_ORDINAL = '(?:[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹ]|[ivxlcdm]+(?![a-z]))'.freeze
  # Appendices additionally accept a single-letter label ("Appendix A").
  ENGLISH_APPENDIX_ORDINAL = '(?:[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹ]|[ivxlcdm]+(?![a-z])|[a-z](?![a-z]))'.freeze

  # Tells whether +text+ contains a sentence terminator.
  # A leading dotted number followed by whitespace ("1.10 ") is removed first
  # so that its dots are not mistaken for full stops.
  # Returns true or false.
  def hav_complete_sentence?(text)
    body = text.gsub(/^\d+(\.\d+)*\s/, '')
    # Both ASCII and full-width terminators are checked.
    !!(body =~ /[\.。!\?！？]/)
  end

  # True when +text+ looks like a volume heading. +options+ is accepted for
  # interface compatibility and is not used.
  def guess_volume?(text,options={})
    return false if hav_complete_sentence?(text)
    return true if text =~ /^第.{1,3}卷/ || text =~ /^卷\s*#{ORDINAL_START}/o
    !!(text =~ /^volume\s*#{ENGLISH_ORDINAL}/io)
  end

  # True when +text+ looks like a part heading. +options+ is accepted for
  # interface compatibility and is not used.
  def guess_part?(text,options={})
    return false if hav_complete_sentence?(text)
    return true if text =~ /^第.{1,3}[部篇]/
    !!(text =~ /^part\s*#{ENGLISH_ORDINAL}/io)
  end

  # True when +text+ looks like a chapter heading.
  def guess_chapter?(text)
    return false if hav_complete_sentence?(text)
    return true if text =~ /^第.{1,4}[章回]/
    !!(text =~ /^chapter\s*#{ENGLISH_ORDINAL}/io)
  end

  # True when +text+ looks like a keyword-style section heading ("第xxx节").
  def guess_section?(text)
    return false if hav_complete_sentence?(text)
    !!(text =~ /^第.{1,3}[节]/)
  end

  # True when +text+ looks like a preface / foreword heading.
  def guess_preface?(text)
    return false if hav_complete_sentence?(text)
    return true if text =~ /^前\s*言$/ || text =~ /^序\s*言$/ || text =~ /^序$/
    # "序" or "序言" followed by an ordinal, as the module rules describe.
    return true if text =~ /^序言?\s*#{ORDINAL_START}/o
    return true if text =~ /^(?:preface|foreword)$/i
    !!(text =~ /^(?:preface|foreword)\s*#{ENGLISH_ORDINAL}/io)
  end

  # True when +text+ looks like an index heading.
  def guess_index?(text)
    return false if hav_complete_sentence?(text)
    return true if text =~ /^索\s*引$/
    return true if text =~ /^索\s*引\s*#{ORDINAL_START}/o
    return true if text =~ /^index$/i
    !!(text =~ /^index\s*#{ENGLISH_ORDINAL}/io)
  end

  # True when +text+ looks like an appendix heading.
  def guess_appendix?(text)
    return false if hav_complete_sentence?(text)
    return true if text =~ /^附\s*录$/
    return true if text =~ /^附\s*录\s*#{APPENDIX_ORDINAL_START}/o
    return true if text =~ /^appendix$/i
    !!(text =~ /^appendix\s*#{ENGLISH_APPENDIX_ORDINAL}/io)
  end

  # True when +text+ looks like a glossary heading.
  def guess_glossary?(text)
    return false if hav_complete_sentence?(text)
    return true if text =~ /^术\s*语$/
    return true if text =~ /^术\s*语\s*#{ORDINAL_START}/o
    return true if text =~ /^glossary$/i
    !!(text =~ /^glossary\s*#{ENGLISH_ORDINAL}/io)
  end

  # Recognises numbered sub-headings with at least one dot ("1.1 定义").
  # Returns :sectN where N is the number of dots in the numbering, false when
  # the line is a sentence or has no title after the number, and nil when the
  # line does not start with a dotted number.
  def guess_digital_section?(text)
    return false if hav_complete_sentence?(text)
    matcher = text.match(/^(\d+(?:\.\d+)+)\s*(.*)/)
    return nil unless matcher
    return false if matcher[2].empty?
    "sect#{matcher[1].count('.')}".to_sym
  end

  # Recognises numbered headings at any depth ("1 管理的概念", "1.1 定义").
  # Returns :chapter for an undotted number, :sectN for N dots, false when the
  # line is a sentence, has no title, or its first number exceeds 99, and nil
  # when the line does not start with a number followed by whitespace.
  def guess_digital_header?(text)
    return false if hav_complete_sentence?(text)
    matcher = text.match(/^(\d+(?:\.\d+)*)\s(.*)/)
    return nil unless matcher
    return false if matcher[2].empty?
    levels = matcher[1].split(".")
    return false if levels.first.to_i > 99
    levels.count == 1 ? :chapter : "sect#{levels.count - 1}".to_sym
  end

  # Classifies +text+ as one of HEAD_TYPES. The checks run in a fixed order
  # and the first match wins; dotted-number headings are reported as :section.
  # Returns nil when no rule matches.
  def guess_header?(text)
    return :volume if guess_volume?(text)
    return :part if guess_part?(text)
    return :chapter if guess_chapter?(text)
    return :section if guess_section?(text)
    return :preface if guess_preface?(text)
    return :appendix if guess_appendix?(text)
    return :index if guess_index?(text)
    return :glossary if guess_glossary?(text)
    return :section if guess_digital_section?(text)
    nil
  end

end
@@ -0,0 +1,265 @@
1
+ # encoding: UTF-8
2
+ require 'poppler'
3
+ require 'pdf-reader'
4
+
5
# Helpers for reading PDF books: scan detection, page-text extraction,
# header/footer removal, outline and illustration extraction, and HTML
# generation. Relies on the Poppler and PDF::Reader bindings required by this
# file and on the project's Utils and HeaderDetect modules.
module PDF
  extend self
  include Utils

  # Header/footer guesses above this many rows are treated as a false
  # detection and reported as 0 rows.
  MAX_MARGIN_ROWS = 2

  # scan_pdf?
  # Checks whether the given file is a scanned (image-only) PDF.
  # parameters:
  #   +filename+ PDF file
  # Returns true when the text of all pages, stripped, is shorter than 1000
  # characters, false otherwise, and nil when the extension is not ".pdf".
  def scan_pdf?(filename)
    return nil unless File.extname(filename).downcase == '.pdf'
    threshold = 1000
    pdf = Poppler::Document.new(filename)
    content = pdf.map{|page| page.get_text}.join('')
    content.strip.length < threshold
  end

  # extract_pdf_pages_text
  # Extracts the text of every page.
  # parameters:
  #   +filename+ PDF file
  # Returns an array with one entry (page.text) per page.
  def extract_pdf_pages_text(filename)
    PDF::Reader.new(filename).pages.map{|page| page.text}
  end

  # sanitize_page_header_and_footer
  # Strips running headers and footers from every page.
  # parameters:
  #   +pdf_pages_text+ collection of page texts
  #   +options+ optional settings
  #     :header_rows_count number of header rows (guessed when absent)
  #     :footer_rows_count number of footer rows (guessed when absent)
  # Returns a new array of page texts.
  def sanitize_page_header_and_footer(pdf_pages_text,options={})
    header_rows_count = options[:header_rows_count] || guess_header_row_count(pdf_pages_text)
    footer_rows_count = options[:footer_rows_count] || guess_footer_row_count(pdf_pages_text)
    pdf_pages_text.map do |page_text|
      page_lines = page_text.split("\n")
      # The slice is nil when the page has fewer lines than the header count.
      kept = page_lines[(header_rows_count)..(-footer_rows_count-1)] || []
      kept.join("\n")
    end
  end

  # extract_pdf_meta
  # Extracts PDF metadata.
  # parameters:
  #   +filename+ PDF file
  # Returns a hash with :author and :title.
  def extract_pdf_meta(filename)
    pdf = Poppler::Document.new(filename)
    {:author=>pdf.author,:title=>pdf.title}
  end

  # extract_sections
  # Extracts the outline (table of contents) of a PDF file.
  # parameters:
  #   +filename+ PDF file
  # Returns an array of {:title, :page_num, :sub_sections} hashes. Extraction
  # is best effort: on any StandardError whatever was collected so far
  # (possibly nothing) is returned instead of raising.
  def extract_sections(filename)
    sections = []
    pdf = Poppler::Document.new(filename)
    indexer = Poppler::IndexIter.new(pdf)
    walk_index(indexer,sections)
    sections
  rescue StandardError
    sections
  end

  # extract_illustrations
  # Extracts the images embedded in a PDF file by running the external
  # `pdfimages` and `mogrify` commands.
  # parameters:
  #   +filename+ PDF file
  #   +options+ optional settings
  #     +dir+ directory for the images; defaults to a sub-directory of the
  #           current directory named after the file.
  # Returns the PNG file names found in that directory (relative to it).
  def extract_illustrations(filename,options={})
    tmp_dir = options[:dir] || File.basename(filename,'.pdf')
    Dir.mkdir(tmp_dir) unless File.directory?(tmp_dir)
    # Argument-list form: no shell is involved, so quotes or other shell
    # metacharacters in the names cannot break or inject into the command.
    system('pdfimages', '-p', filename, "#{tmp_dir}/")
    # The wildcard is handed to mogrify unexpanded, exactly as the previously
    # single-quoted shell argument was.
    system('mogrify', '-format', 'png', "#{tmp_dir}/*.ppm")
    # Block form restores the working directory even if globbing raises.
    Dir.chdir(tmp_dir) { Dir.glob('*.png') }
  end

  # Repairs line breaks inside each page and across page boundaries.
  # The reference line length is half of the longest content line length
  # guessed by Utils.guess_content_line_length over all pages.
  # Returns the pages unchanged when no length can be guessed.
  def fixed_break_with_pages_text(pages_text)
    longest = pages_text.map{|text| Utils.guess_content_line_length(text)}.compact.max
    return pages_text if longest.nil?
    line_length = longest * 0.5
    pages_text = pages_text.map{|page_text| Utils.fixed_page_break(page_text,:length=>line_length) }
    fixed_break_of_cross_page(pages_text,line_length)
  end

  # Builds the HTML of a book. +sections+ is currently ignored because the
  # page numbers recorded in it are not accurate; the HTML is always generated
  # from the page texts.
  def gen_html_from_sections_and_page_texts(sections,page_texts,illustrations)
    gen_html_from_page_texts(page_texts,illustrations)
  end

  # Selects the illustrations whose file name carries page number +index+,
  # i.e. whose second "-"-separated field converts to +index+.
  def extract_page_illustrations(illustrations,index)
    illustrations.select{|image_path| image_path.split("-")[1].to_i == index}
  end

  # Renders every page to HTML and concatenates the results.
  # +options+ is forwarded to gen_html_from_page_text with :index set to the
  # 0-based page position.
  def gen_html_from_page_texts(page_texts,illustrations,options={})
    page_htmls = page_texts.each_with_index.map do |page_text,index|
      # pdfimages -p embeds 1-based page numbers in the file names, whereas
      # +index+ is 0-based.
      page_illustrations = extract_page_illustrations(illustrations,index + 1)
      gen_html_from_page_text(page_text,page_illustrations,options.merge(:index=>index))
    end
    page_htmls.join("")
  end

  # Renders one page: non-blank lines become <h2> (when HeaderDetect
  # recognises a heading) or <p class='division'>, followed by one <img>
  # paragraph per illustration, all wrapped in <div class='page'>.
  # +options+[:index] is used for the element ids / name.
  def gen_html_from_page_text(page_text,illustrations,options={})
    fragments = page_text.split("\n").each_with_index.map do |line,index|
      next unless line.present?
      text = Utils.escape_html(Utils.clean_text(line))
      if HeaderDetect.guess_header?(line)
        "<h2 id='#{options[:index]}_#{index}'>#{text}</h2>"
      else
        "<p class='division'>#{text}</p>"
      end
    end
    html = fragments.compact.join("")

    images = illustrations.map{|image_path| "<p class='division'><img src='#{image_path}' /></p>"}.join("")
    "<div class='page' name='#{options[:index]}' >#{html}#{images}</div>"
  end

  # Recursively copies an outline iterator into +sections+ as
  # {:title, :page_num} hashes, adding :sub_sections for entries with children.
  def walk_index(indexer,sections)
    indexer.each_with_index do |item,index|
      action = item.action
      sections[index] = {:title=>Utils.clean_text(action.title),:page_num=>action.dest.page_num}
      child = item.child
      next if child.nil?
      sub_sections = []
      walk_index(child,sub_sections)
      sections[index][:sub_sections] = sub_sections
    end
  end
  # Kept under its old name for callers of the former duplicate method.
  alias work_index walk_index

  # Joins a paragraph that was split across a page boundary: when the last
  # line of a page does not end with an end mark (Utils.end_mark?), the first
  # line of the next page is merged into it. +pages+ is modified in place and
  # returned. +length+ is accepted for interface compatibility and not used.
  def fixed_break_of_cross_page(pages,length=80)
    (0...(pages.count - 1)).each do |i|
      first_page_lines = pages[i].split("\n")
      second_page_lines = pages[i+1].split("\n")
      next unless first_page_lines.any? && second_page_lines.any?
      next if Utils.end_mark?(first_page_lines.last)

      first_page_lines[-1] = Utils.merge_para_part(first_page_lines.last,second_page_lines.first)
      second_page_lines.shift
      pages[i] = first_page_lines.join("\n")
      pages[i+1] = second_page_lines.join("\n")
    end
    pages
  end

  # Guesses the number of header rows.
  # Headers and footers follow some regularities:
  #   1. They appear at a fixed position on every page, or mirrored on
  #      facing pages.
  #   2. They usually show the book title, chapter title or page number.
  #   3. They either repeat on every page or alternate every other page.
  # Returns 0, 1 or 2; a guess of more than 2 rows is reported as 0.
  def guess_header_row_count(pages_text)
    count_margin_rows(pages_text){|lines,offset| lines[offset]}
  end

  # Guesses the number of footer rows; see guess_header_row_count.
  def guess_footer_row_count(pages_text)
    count_margin_rows(pages_text){|lines,offset| lines[-offset - 1]}
  end

  # Guesses whether the given rows (the same row taken from every page) form
  # a header/footer row.
  # Rules, tried in order:
  #   1. Rows of adjacent pages with similarity above 70% are collected; when
  #      the collected indexes exceed 50% of the row count the answer is true.
  #   2. The same test for rows two pages apart.
  #   3. Page numbers: true when more than 50% of the rows convert to a
  #      positive integer.
  def guess_header_line?(lines)
    return false if lines.empty?

    lines = lines.map{|line| line.strip if line.present?}
    total = lines.count.to_f

    [1, 2].each do |stride|
      return true if similar_line_indexes(lines,stride).count / total > 0.5
    end

    lines.count{|line| line.to_i > 0} / total > 0.5
  end
  alias guess_footer_line? guess_header_line?

  private

  # Counts consecutive margin rows. The block receives the lines of one page
  # and the current offset and returns that page's candidate row. Scanning
  # stops as soon as more than MAX_MARGIN_ROWS rows matched, because such a
  # result is reported as 0 anyway; this also guarantees termination.
  def count_margin_rows(pages_text)
    pages_lines = pages_text.map{|page_text| page_text.split("\n")}
    count = 0
    while count <= MAX_MARGIN_ROWS && guess_header_line?(pages_lines.map{|lines| yield(lines,count)})
      count += 1
    end
    count > MAX_MARGIN_ROWS ? 0 : count
  end

  # Indexes of rows that are more than 70% similar to the row +stride+
  # positions later, together with the indexes of those later rows.
  def similar_line_indexes(lines,stride)
    pairs = []
    lines.each_with_index do |line,index|
      pairs << [index,index + stride] if Utils.text_similarity(line,lines[index + stride]) > 0.7
    end
    pairs.flatten.uniq
  end
end
@@ -0,0 +1,108 @@
1
+ # encoding: UTF-8
2
+ require 'nokogiri'
3
+
4
# Helpers for turning a plain-text book into title / outline / content parts
# and rendering them as HTML. Relies on the project's Utils and HeaderDetect
# modules and on String#present?.
module TXT
  extend self

  # Reads +filename+ and splits it into its parts.
  # Returns [title, outlines, content]; the file name without ".txt" is used
  # as the title hint, and +outlines+ is nil when no outline block is found.
  def extract_book_part(filename)
    # File.read closes the file handle, unlike File.open(...).read.
    content = clean_forward_blank(File.read(filename))

    title,content = extract_title_and_content(content,:title=>File.basename(filename,'.txt'))
    outlines,content = extract_outlines_and_content(content)

    [title,outlines,content]
  end

  # Removes carriage returns and the blank lines at the beginning of
  # +content+. When removing carriage returns raises, the text is converted
  # with Utils.to_utf8 first; '' is returned if that conversion yields nil.
  def clean_forward_blank(content)
    begin
      content = content.gsub("\r","")
    rescue
      content = Utils.to_utf8(content)
      return '' if content.nil?
      content = content.gsub("\r","")
    end

    content.split(/\n/).drop_while{|line| !line.present?}.join("\n")
  end

  # Separates the title line from the rest of +content+.
  # The first line is taken as the title when its similarity to
  # +options+[:title] exceeds 0.8; otherwise the hint itself is the title and
  # the content is kept whole. Returns [title, content].
  def extract_title_and_content(content,options={})
    title = options[:title] || ''
    lines = content.split("\n")
    # Nothing to compare against: keep the hint and an empty body.
    return [title, ''] if lines.empty?

    if Utils.text_similarity(lines.first,title) > 0.8
      title = lines.shift
    end
    [title, lines.join("\n")]
  end

  # Splits a leading run of heading lines (an outline / table of contents)
  # from +content+. The run counts as an outline only when it has more than
  # one line; otherwise the headings stay in the content.
  # Returns [outlines_or_nil, content]. A blank line directly after the
  # leading headings is dropped.
  def extract_outlines_and_content(content)
    lines = content.split(/\n/)
    outlines = lines.take_while{|line| HeaderDetect.guess_header?(line)}
    rest = lines.drop(outlines.count)

    # The line that ended the run is kept only when it carries text.
    breaker = rest.shift
    rest.unshift(breaker) if breaker.present?

    if outlines.count > 1
      [outlines.join("\n"),rest.join("\n")]
    else
      [nil,(outlines + rest).join("\n")]
    end
  end

  # Renders a whole book: <h1> title, outline list, then content.
  # The title is HTML-escaped like every content line.
  def gen_html_from_txt_book(title,outlines,content,options={})
    html = "<h1>#{Utils.escape_html(title.to_s)}</h1>"
    html = html + gen_html_from_txt_outlines(outlines,options)
    html + gen_html_from_txt_content(content,options)
  end

  # Renders the newline-separated +outlines+ as an ordered list, one
  # HTML-escaped <li> per line. Returns '' when there are no outlines.
  def gen_html_from_txt_outlines(outlines,options={})
    return '' unless outlines.present?

    items = outlines.split("\n").map{|item| "<li>#{Utils.escape_html(item)}</li>"}
    # join is required: interpolating the array itself would emit its
    # inspect form (brackets and quotes) into the HTML.
    "<ol class='outlines'>#{items.join('')}</ol>"
  end

  # Renders the content: every non-blank line becomes <h2 id='ROW'> (when
  # HeaderDetect recognises a heading; ROW is the 1-based line number) or
  # <p class='division'>. Blank lines are skipped but still counted.
  def gen_html_from_txt_content(content,options={})
    fragments = content.split("\n").each_with_index.map do |line,index|
      next unless line.present?
      text = Utils.escape_html(Utils.clean_text(line))
      if HeaderDetect.guess_header?(line)
        "<h2 id='#{index + 1}'>#{text}</h2>"
      else
        "<p class='division'>#{text}</p>"
      end
    end
    fragments.compact.join("")
  end
end