ebook_tools 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,161 @@
1
+ # encoding: utf-8
2
+ # HeaderDetect
3
+ # HeaderDetect模块提供对标题的检测
4
+ #
5
+ # 文档结构信息分析
6
+ # 一本书在编排的时候会有自己的结构信息,这些结构信息通常通过卷、篇、部分、章(回)节等表述,也会使用序号的方式表述。总体上可以分为以下几种:
7
+ # 1. 文本描述(text): 按卷、部分(篇)、章(回)、节等文字表述
8
+ # 2. 数字描述(digital): 所有结构信息都是按照数字序号表示,比如 1 xxxxx; 1.1 xxxxx
9
+ # 3. 混合描述(hybrid):章按照文字表述,节按照序号表示,比如 1.1 xxxxxx
10
+ # 根据不同的类型,对结构信息的提取采用不同的处理手段。
11
+ #
12
+ # 有效的标题信息应该符合以下规则:
13
+ # 1. 标题应该不包含完整的句子(应该不包含句子分隔符,例如“。","!"等)
14
+ # 2. 应该包含结构信息表述,具体如下:
15
+ # 文本描述:
16
+ # 卷: 以"第xxx卷"开始
17
+ # 以"卷"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
18
+ # 以"volume"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
19
+ # 部分(篇): 以"第xxx部"或"第xxx篇"开始
20
+ # 以"part"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
21
+ # 章(回): 以"第xxx章"或"第xxx回"开始
22
+ # 以"chapter"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
23
+ # 节: 以"第xxx节"开始
24
+ # 前言: 以"前"开始,以"言"结束,中间加入空白字符。例如"前言","前 言"等。
25
+ # 以"序"开始,以"言"结束,中间加入空白字符。例如"序言","序 言"等。
26
+ # 单个"序"
27
+ # 以"序"或"序言"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
28
+ # "preface"
29
+ # "foreword"
30
+ # 以"preface"或"foreword"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
31
+ # 索引: 以"索"开始,以"引"结束,中间加入空白字符。例如"索引","索 引"等。
32
+ # 以"索引"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
33
+ # "index"
34
+ # 以"index"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
35
+ # 附录: 以"附"开始,以"录"结束,中间加入空白字符。例如"附录","附 录"等。
36
+ # 以"附录"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
37
+ # "appendix"
38
+ # 以"appendix"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
39
+ # 术语: 以"术"开始,以"语"结束,中间加入空白字符。例如"术语","术 语"等。
40
+ # 以"术语"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
41
+ # "glossary"
42
+ # 以"glossary"开始,后面跟序号表述方式,例如 “I”,“Ⅱ”,“1”等
43
+ #
44
+ # 数字描述:
45
+ # 以数字序号层级表达,数字序号和标题内容之间有空白字符分隔。例如"1 管理的概念", "1.1 定义", "1.1.1 管理"等。
46
# HeaderDetect recognises heading lines in the plain text of a book.
#
# A line is treated as a heading only when it contains no sentence
# punctuation (after an optional leading "1.2.3 " style number is removed)
# and it starts with one of the structural markers matched below:
# volume, part, chapter, section, preface, appendix, index, glossary, or a
# dotted section number such as "1.1 Title".
module HeaderDetect
  extend self

  HEAD_TYPES = [:volume, :part, :chapter, :section, :preface, :appendix, :index, :glossary].freeze

  # Character class accepted as the first character of an ordinal that
  # follows a keyword: ASCII digits, Unicode roman numerals and "I"/"i".
  ORDINAL = '[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]'.freeze
  # Same as ORDINAL but also accepts ASCII letters (used for "Appendix A").
  ORDINAL_OR_LETTER = '[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIiA-Za-z]'.freeze

  # Sentence terminators; a line containing one of them is not a heading.
  SENTENCE_MARK = /[\.。!\?!?]/
  # Leading dotted number followed by one whitespace character. Every
  # component may have several digits ("1.10 ", "12.3.45 ").
  NUMBER_PREFIX = /^\d+(\.\d+)*\s/

  # For each heading type: [pattern tested against the raw text,
  # pattern tested against the downcased text (or nil when there is none)].
  PATTERNS = {
    :volume   => [/^第.{1,3}卷|^卷\s*#{ORDINAL}/, /^volume\s*#{ORDINAL}/],
    :part     => [/^第.{1,3}[部篇]/, /^part\s*#{ORDINAL}/],
    :chapter  => [/^第.{1,4}[章回]/, /^chapter\s*#{ORDINAL}/],
    :section  => [/^第.{1,3}节/, nil],
    # "序" may be followed directly by an ordinal; "言" is optional there.
    :preface  => [/^前\s*言$|^序\s*言$|^序$|^序言?\s*#{ORDINAL}/,
                  /^(?:preface|foreword)(?:$|\s*#{ORDINAL})/],
    :index    => [/^索\s*引(?:$|\s*#{ORDINAL})/, /^index(?:$|\s*#{ORDINAL})/],
    :appendix => [/^附\s*录(?:$|\s*#{ORDINAL_OR_LETTER})/,
                  /^appendix(?:$|\s*#{ORDINAL_OR_LETTER})/],
    :glossary => [/^术\s*语(?:$|\s*#{ORDINAL})/, /^glossary(?:$|\s*#{ORDINAL})/]
  }.freeze

  # Tells whether +text+ contains sentence punctuation once a leading
  # section number ("1.10 ") has been removed.
  #
  # @param text [String] a single line of text
  # @return [Boolean]
  def hav_complete_sentence?(text)
    stripped = text.gsub(NUMBER_PREFIX, '')
    !(stripped =~ SENTENCE_MARK).nil?
  end

  # @param text [String]
  # @param options [Hash] accepted for compatibility; not used
  # @return [Boolean] true when +text+ looks like a volume heading
  def guess_volume?(text, options = {})
    keyword_heading?(text, :volume)
  end

  # @param text [String]
  # @param options [Hash] accepted for compatibility; not used
  # @return [Boolean] true when +text+ looks like a part heading
  def guess_part?(text, options = {})
    keyword_heading?(text, :part)
  end

  # @return [Boolean] true when +text+ looks like a chapter heading
  def guess_chapter?(text)
    keyword_heading?(text, :chapter)
  end

  # @return [Boolean] true when +text+ looks like a "第…节" section heading
  def guess_section?(text)
    keyword_heading?(text, :section)
  end

  # @return [Boolean] true when +text+ looks like a preface/foreword heading
  def guess_preface?(text)
    keyword_heading?(text, :preface)
  end

  # @return [Boolean] true when +text+ looks like an index heading
  def guess_index?(text)
    keyword_heading?(text, :index)
  end

  # @return [Boolean] true when +text+ looks like an appendix heading
  def guess_appendix?(text)
    keyword_heading?(text, :appendix)
  end

  # @return [Boolean] true when +text+ looks like a glossary heading
  def guess_glossary?(text)
    keyword_heading?(text, :glossary)
  end

  # Detects a dotted section number with at least two components followed
  # by a title, e.g. "1.1 Definition" or "3.10.2 Notes".
  #
  # @param text [String]
  # @return [Symbol, false] :sectN where N is the number of dots, or false
  def guess_digital_section?(text)
    return false if hav_complete_sentence?(text)

    matched = text.match(/^((?:\d+\.)+\d+)\s*(.*)/)
    return false if matched.nil? || matched[2].empty?

    # Count components on the number only, never on the title.
    "sect#{matched[1].split('.').count - 1}".to_sym
  end

  # Detects a numbered heading of any depth: "1 Title" is a chapter,
  # "1.2 Title" is :sect1, "1.2.3 Title" is :sect2 and so on. Numbers whose
  # first component exceeds 99 are rejected.
  #
  # @param text [String]
  # @return [Symbol, false]
  def guess_digital_header?(text)
    return false if hav_complete_sentence?(text)

    matched = text.match(/^(\d+(?:\.\d+)*)\s(.*)/)
    return false if matched.nil? || matched[2].empty?

    levels = matched[1].split('.')
    return false if levels.first.to_i > 99

    levels.count == 1 ? :chapter : "sect#{levels.count - 1}".to_sym
  end

  # Classifies +text+ as a heading.
  #
  # @param text [String]
  # @return [Symbol, nil] the heading type, or nil when +text+ is not a heading
  def guess_header?(text)
    return :volume   if guess_volume?(text)
    return :part     if guess_part?(text)
    return :chapter  if guess_chapter?(text)
    return :section  if guess_section?(text)
    return :preface  if guess_preface?(text)
    return :appendix if guess_appendix?(text)
    return :index    if guess_index?(text)
    return :glossary if guess_glossary?(text)
    return :section  if guess_digital_section?(text)
    nil
  end

  private

  # Shared implementation of the keyword based predicates: rejects lines
  # with sentence punctuation, then tries the raw-text pattern and finally
  # the pattern for the downcased text.
  def keyword_heading?(text, type)
    return false if hav_complete_sentence?(text)

    native, latin = PATTERNS.fetch(type)
    return true if text =~ native

    latin ? !(text.downcase =~ latin).nil? : false
  end
end
@@ -0,0 +1,265 @@
1
+ # encoding: UTF-8
2
+ require 'poppler'
3
+ require 'pdf-reader'
4
+
5
# Helpers for turning a PDF into page texts, outline sections, extracted
# illustrations and simple HTML. Relies on the poppler and pdf-reader gems,
# the external `pdfimages` and `mogrify` commands, and the project's Utils
# and HeaderDetect modules.
module PDF
  extend self
  include Utils

  # Header/footer guesses larger than this many rows are discarded (the
  # guess is then considered unreliable and 0 is used).
  HEADER_FOOTER_MAX_ROWS = 2

  # scan_pdf?
  # Guesses whether the file is a scanned (image only) PDF: a PDF whose
  # extracted text is shorter than 1000 characters.
  # parameters:
  #   +filename+ path of the pdf file
  # Returns false for files whose extension is not ".pdf".
  def scan_pdf?(filename)
    return false unless File.extname(filename).downcase == '.pdf'

    threshold = 1000
    pdf = Poppler::Document.new(filename)
    content = pdf.map { |page| page.get_text }.join('')
    content.strip.length < threshold
  end

  # extract_pdf_pages_text
  # Returns the text of every page, in page order.
  # parameters:
  #   +filename+ path of the pdf file
  def extract_pdf_pages_text(filename)
    PDF::Reader.new(filename).pages.map(&:text)
  end

  # sanitize_page_header_and_footer
  # Removes the page header and footer rows from every page text.
  # parameters:
  #   +pdf_pages_text+ collection of page texts
  #   +options+        optional settings
  #       :header_rows_count number of header rows (guessed when omitted)
  #       :footer_rows_count number of footer rows (guessed when omitted)
  def sanitize_page_header_and_footer(pdf_pages_text, options = {})
    header_rows_count = options[:header_rows_count] || guess_header_row_count(pdf_pages_text)
    footer_rows_count = options[:footer_rows_count] || guess_footer_row_count(pdf_pages_text)

    pdf_pages_text.map do |page_text|
      rows = page_text.split("\n")
      # The slice is nil when the page has fewer rows than the header count.
      kept = rows[header_rows_count..(-footer_rows_count - 1)] || []
      kept.join("\n")
    end
  end

  # extract_pdf_meta
  # Returns a hash with the :author and :title of the document.
  # parameters:
  #   +filename+ path of the pdf file
  def extract_pdf_meta(filename)
    pdf = Poppler::Document.new(filename)
    { :author => pdf.author, :title => pdf.title }
  end

  # extract_sections
  # Extracts the outline (bookmarks) of the pdf file.
  # parameters:
  #   +filename+ path of the pdf file
  # Best effort: when reading the outline raises, whatever was collected so
  # far (possibly nothing) is returned.
  def extract_sections(filename)
    sections = []
    pdf = Poppler::Document.new(filename)
    indexer = Poppler::IndexIter.new(pdf)
    walk_index(indexer, sections)
    sections
  rescue
    sections
  end

  # extract_illustrations
  # Extracts the images of the pdf file with `pdfimages`, converts them to
  # PNG with `mogrify` and returns the PNG file names (relative to the
  # output directory).
  # parameters:
  #   +filename+ path of the pdf file
  #   +options+  optional settings
  #       :dir output directory; defaults to a directory in the current
  #            working directory named after the pdf file
  def extract_illustrations(filename, options = {})
    tmp_dir = options[:dir] || File.basename(filename, '.pdf')
    Dir.mkdir(tmp_dir) unless Dir.exist?(tmp_dir)

    # Argument-vector form: no shell is involved, so quotes or other shell
    # metacharacters in the paths cannot alter the command.
    system('pdfimages', '-p', filename, "#{tmp_dir}/")
    # The wildcard is passed through literally, as in the original quoted
    # command; mogrify is expected to expand it itself.
    system('mogrify', '-format', 'png', "#{tmp_dir}/*.ppm")

    # Block form restores the working directory even when globbing raises.
    Dir.chdir(tmp_dir) { Dir.glob('*.png') }
  end

  # Repairs line breaks inside every page and across page boundaries.
  # Half of the longest guessed content line length is passed to the Utils
  # helpers. The input is returned unchanged when no line length can be
  # guessed (for example when +pages_text+ is empty).
  def fixed_break_with_pages_text(pages_text)
    longest = pages_text.map { |text| Utils.guess_content_line_length(text) }.compact.max
    return pages_text if longest.nil?

    line_length = longest * 0.5
    pages = pages_text.map { |page_text| Utils.fixed_page_break(page_text, :length => line_length) }
    fixed_break_of_cross_page(pages, line_length)
  end

  # Builds the HTML of the whole document. The page numbers found in
  # +sections+ are not accurate, so the outline is currently ignored and
  # the HTML is always generated from the page texts.
  def gen_html_from_sections_and_page_texts(sections, page_texts, illustrations)
    gen_html_from_page_texts(page_texts, illustrations)
  end

  # Selects the illustrations whose file name carries the page number
  # +index+ as its second "-" separated field.
  # NOTE(review): `pdfimages -p` presumably writes 1-based page numbers
  # while callers pass a 0-based page index -- confirm whether this is off
  # by one.
  def extract_page_illustrations(illustrations, index)
    illustrations.select { |image_path| image_path.split("-")[1].to_i == index }
  end

  # Generates the HTML of all pages and concatenates it.
  def gen_html_from_page_texts(page_texts, illustrations, options = {})
    page_htmls = page_texts.each_with_index.map do |page_text, index|
      page_illustrations = extract_page_illustrations(illustrations, index)
      gen_html_from_page_text(page_text, page_illustrations, options.merge(:index => index))
    end
    page_htmls.join("")
  end

  # Generates the HTML of one page: every non blank line becomes an <h2>
  # (when HeaderDetect considers it a heading) or a <p>, followed by the
  # page illustrations, all wrapped in a <div class='page'>.
  # +options[:index]+ is the page index used in the generated ids/names.
  def gen_html_from_page_text(page_text, illustrations, options = {})
    page_index = options[:index]

    fragments = page_text.split("\n").each_with_index.map do |line, row|
      next unless line.present?

      if HeaderDetect.guess_header?(line)
        "<h2 id='#{page_index}_#{row}'>#{Utils.escape_html(Utils.clean_text(line))}</h2>"
      else
        "<p class='division'>#{Utils.escape_html(Utils.clean_text(line))}</p>"
      end
    end

    images = illustrations.map { |image_path| "<p class='division'><img src='#{image_path}' /></p>" }.join("")
    "<div class='page' name='#{page_index}' >#{fragments.compact.join}#{images}</div>"
  end

  # Recursively copies an outline iterator into +sections+: one hash per
  # entry with :title, :page_num and, for entries with children,
  # :sub_sections.
  def walk_index(indexer, sections)
    indexer.each_with_index do |entry, index|
      sections[index] = {
        :title => Utils.clean_text(entry.action.title),
        :page_num => entry.action.dest.page_num
      }
      child = entry.child
      next if child.nil?

      sub_sections = []
      walk_index(child, sub_sections)
      sections[index][:sub_sections] = sub_sections
    end
  end
  # Kept for callers using the old name; both did exactly the same work.
  alias work_index walk_index

  # Joins a paragraph that was split by a page break: when the last line of
  # a page does not end with an end mark it is merged with the first line
  # of the next page, which is then removed from that page.
  # +pages+ is modified in place and returned. +length+ is not used.
  def fixed_break_of_cross_page(pages, length = 80)
    # Index based on purpose: a page changed in one step is re-read as the
    # "first page" of the next step.
    (0...(pages.count - 1)).each do |i|
      first_page_lines = pages[i].split("\n")
      second_page_lines = pages[i + 1].split("\n")
      next unless first_page_lines.any? && second_page_lines.any?
      next if Utils.end_mark?(first_page_lines.last)

      first_page_lines[-1] = Utils.merge_para_part(first_page_lines.last, second_page_lines.first)
      second_page_lines.shift
      pages[i] = first_page_lines.join("\n")
      pages[i + 1] = second_page_lines.join("\n")
    end
    pages
  end

  # Guesses the number of header rows.
  # Headers and footers follow some regularities:
  #   1. they sit at a fixed position on every page, or mirrored on facing
  #      pages;
  #   2. they usually show the book title, chapter title or page number;
  #   3. they repeat either on every page or on every other page.
  # Returns 0 when more than HEADER_FOOTER_MAX_ROWS rows look like headers.
  def guess_header_row_count(pages_text)
    rows_per_page = pages_text.map { |page_text| page_text.split("\n") }
    count_repeated_rows { |i| guess_header_line?(rows_per_page.map { |rows| rows[i] }) }
  end

  # Guesses the number of footer rows, scanning from the bottom of the
  # pages. Returns 0 when more than HEADER_FOOTER_MAX_ROWS rows look like
  # footers.
  def guess_footer_row_count(pages_text)
    rows_per_page = pages_text.map { |page_text| page_text.split("\n") }
    count_repeated_rows { |i| guess_footer_line?(rows_per_page.map { |rows| rows[-i - 1] }) }
  end

  # Guesses whether +lines+ (the same row taken from every page) is a
  # header/footer row. Rules:
  #   1. rows of adjacent pages whose similarity is above 0.7 are
  #      collected; the row is a header/footer when the collected pages are
  #      more than 50% of all pages;
  #   2. the same test for pages two apart (alternating headers);
  #   3. page numbers: the row is a header/footer when more than 50% of the
  #      pages have a positive number there.
  def guess_header_line?(lines)
    return false if lines.empty?

    lines = lines.map { |line| line.strip if line.present? }
    total = lines.count.to_f

    [1, 2].each do |step|
      return true if similar_line_indexes(lines, step).count / total > 0.5
    end

    lines.count { |line| line.to_i > 0 } / total > 0.5
  end
  alias guess_footer_line? guess_header_line?

  private

  # Counts how many consecutive rows (0, 1, 2, ...) satisfy the block.
  # Stops as soon as the count exceeds HEADER_FOOTER_MAX_ROWS, because such
  # a result is discarded anyway; this also bounds the loop.
  def count_repeated_rows
    count = 0
    count += 1 while count <= HEADER_FOOTER_MAX_ROWS && yield(count)
    count > HEADER_FOOTER_MAX_ROWS ? 0 : count
  end

  # Indexes of the lines taking part in a pair (i, i + step) whose text
  # similarity is above 0.7. The partner index is recorded even when it is
  # past the end of +lines+, as the comparison is then made against nil.
  def similar_line_indexes(lines, step)
    indexes = []
    lines.each_with_index do |line, index|
      next unless Utils.text_similarity(line, lines[index + step]) > 0.7

      indexes << index << (index + step)
    end
    indexes.uniq
  end
end
@@ -0,0 +1,108 @@
1
+ # encoding: UTF-8
2
+ require 'nokogiri'
3
+
4
# Helpers for turning a plain text book into a title, an outline and simple
# HTML. Relies on the project's Utils and HeaderDetect modules and on
# Object#present?.
module TXT
  extend self

  # Reads +filename+ and splits it into its parts.
  #
  # @param filename [String] path of the .txt file
  # @return [Array] [title, outlines, content]; outlines is nil when no
  #   outline block was found at the top of the text
  def extract_book_part(filename)
    # File.read closes the handle; it returns "" (never nil) for empty files.
    content = clean_forward_blank(File.read(filename))

    title, content = extract_title_and_content(content, :title => File.basename(filename, '.txt'))
    outlines, content = extract_outlines_and_content(content)

    [title, outlines, content]
  end

  # Removes carriage returns and the blank lines at the top of +content+.
  # When the text cannot be processed as is, it is converted with
  # Utils.to_utf8 first; '' is returned when that conversion yields nil.
  def clean_forward_blank(content)
    begin
      content = content.gsub("\r", "")
    rescue
      content = Utils.to_utf8(content)
      return '' if content.nil?

      content = content.gsub("\r", "")
    end

    content.split(/\n/).drop_while { |line| !line.present? }.join("\n")
  end

  # Uses the first line as the title when it is similar enough (> 0.8) to
  # the expected title, and removes it from the content in that case.
  #
  # @param content [String]
  # @param options [Hash] :title the expected title (e.g. the file name)
  # @return [Array] [title, remaining content]
  def extract_title_and_content(content, options = {})
    title = options[:title] || ''
    lines = content.split("\n")
    first_line = lines.first

    # Nothing to compare against when the content is empty.
    if !first_line.nil? && Utils.text_similarity(first_line, title) > 0.8
      title = lines.shift
    end

    [title, lines.join("\n")]
  end

  # Splits off the run of heading lines at the very top of +content+, which
  # is taken to be the table of contents when it has more than one line.
  #
  # @return [Array] [outlines joined by "\n" or nil, remaining content]
  def extract_outlines_and_content(content)
    lines = content.split(/\n/)
    outlines = lines.take_while { |line| HeaderDetect.guess_header?(line) }
    rest = lines.drop(outlines.size)

    # The line that ended the run is dropped when it is blank.
    rest.shift unless rest.empty? || rest.first.present?

    if outlines.count > 1
      [outlines.join("\n"), rest.join("\n")]
    else
      [nil, (outlines + rest).join("\n")]
    end
  end

  # Builds the HTML of the whole book: <h1> title, outline list, content.
  # The title is HTML-escaped like the body lines.
  def gen_html_from_txt_book(title, outlines, content, options = {})
    "<h1>#{Utils.escape_html(title.to_s)}</h1>" +
      gen_html_from_txt_outlines(outlines, options) +
      gen_html_from_txt_content(content, options)
  end

  # Renders the outline (one entry per line) as an ordered list; returns ''
  # when there is no outline. Entries are HTML-escaped.
  def gen_html_from_txt_outlines(outlines, options = {})
    return '' unless outlines.present?

    # Joined explicitly: interpolating the Array itself would emit its
    # inspect form (brackets, quotes and commas).
    items = outlines.split("\n").map { |item| "<li>#{Utils.escape_html(item)}</li>" }.join
    "<ol class='outlines'>#{items}</ol>"
  end

  # Renders every non blank line as an <h2> (heading lines, with the
  # 1-based row number as id) or as a <p class='division'>.
  def gen_html_from_txt_content(content, options = {})
    fragments = content.split("\n").each_with_index.map do |line, index|
      next unless line.present?

      if HeaderDetect.guess_header?(line)
        "<h2 id='#{index + 1}'>#{Utils.escape_html(Utils.clean_text(line))}</h2>"
      else
        "<p class='division'>#{Utils.escape_html(Utils.clean_text(line))}</p>"
      end
    end
    fragments.compact.join("")
  end
end