ebook_tools 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,224 @@
1
+ # encoding: UTF-8
2
+ require 'levenshtein'
3
+ require 'cgi'
4
+ require 'pathname'
5
+ require 'fileutils'
6
+ require 'iconv'
7
+
8
# Lightweight blank?/present? helpers made available on every object.
class Object
  # Returns a truthy value when the receiver is considered empty.
  # Objects that respond to +empty?+ delegate to it; any other object is
  # blank only when it is falsy (nil or false).
  def blank?
    return empty? if respond_to?(:empty?)

    !self
  end

  # Logical negation of blank?.
  def present?
    !blank?
  end
end
17
+
18
# Text, encoding and filesystem helpers shared by the ebook tools.
# All methods are callable on the module itself (Utils.foo) via `extend self`.
# Several methods rely on Object#blank?, which is defined outside this module.
module Utils
  extend self

  # Characters treated as sentence terminators by end_mark?.
  # NOTE(review): '!' and '?' each appear twice; possibly full-width variants
  # were intended for the second pair - confirm against the original file.
  END_MARKS = ['.', '。', '"', '!', '?', '!', '?', '…', '>'].freeze

  # fixed_page_break
  # Repairs abnormal line breaks in a text: a line that is not "closed"
  # (see line_closed?) is merged with the following non-empty line.
  # Empty lines are dropped and the result is joined with "\n".
  # parameters:
  #   +page_text+ the text content
  #   +options+   optional settings
  #     :length   expected line length; when absent it is guessed with
  #               guess_content_line_length
  def fixed_page_break(page_text, options = {})
    length = options[:length] || guess_content_line_length(page_text)

    paragraphs = []
    continuing = false # true while the previous line is still open
    page_text.each_line do |raw_line|
      line = raw_line.gsub("\r\n", '').gsub("\n", '').strip
      next if line.empty?

      if continuing
        paragraphs[-1] = merge_para_part(paragraphs.last, line)
      else
        paragraphs << line
      end
      # Whether the NEXT line continues is decided from the current raw line,
      # not from the merged paragraph.
      continuing = !line_closed?(line, length)
    end
    paragraphs.join("\n")
  end

  # Computes the similarity of two texts as
  # (longest length - Levenshtein distance) / longest length.
  # Returns 0 when either text is blank, and also (best effort) when the
  # computation raises any StandardError.
  def text_similarity(text1, text2)
    return 0 if text1.blank? || text2.blank?

    distance = Levenshtein.distance(text1, text2)
    longest = [text1.length, text2.length].max
    (longest - distance) / longest.to_f
  rescue StandardError
    0
  end

  # line_closed?
  # Decides whether a line is a complete line. A line counts as closed when:
  # * it ends with a sentence terminator (see end_mark?), or
  # * after removing dots, dashes and spaces it is longer than 80 characters, or
  # * after that removal it is shorter than +length+ (+length+ * 2 when the
  #   line contains Han characters).
  # parameters:
  #   +text+   the content of one line
  #   +length+ the expected line length (default 60)
  def line_closed?(text, length = 60)
    return true if end_mark?(text)

    short_text = text.gsub(/[\.\-—. ]/, '')
    threshold = short_text =~ /\p{Han}/ ? length * 2 : length
    short_text.length > 80 || short_text.length < threshold
  end

  # Returns true when the last character of +text+ is one of END_MARKS,
  # false otherwise (including for an empty string).
  def end_mark?(text)
    END_MARKS.include?(text[-1])
  end

  # Joins two parts of a paragraph: without a separator when +part2+
  # contains Han characters, with a single space otherwise.
  def merge_para_part(part1, part2)
    separator = if part2 =~ /\p{Han}/
                  ''
                else
                  ' '
                end
    [part1, part2].join(separator)
  end

  # Guesses the line length of +content+: scanning from the last line
  # backwards, returns the length of the first line shorter than 80
  # characters (the count includes the line terminator, as each_line yields
  # it). Returns 0 for blank content or when no such line exists.
  def guess_content_line_length(content)
    return 0 if content.blank?

    content.each_line.map { |line| line.length }.reverse.find { |len| len < 80 } || 0
  end

  # clean_text
  # Returns a clean text: surrounding whitespace is stripped and every
  # remaining newline removed. Mainly used when converting txt titles to
  # html. nil is returned unchanged.
  def clean_text(text)
    return text if text.nil?

    text.strip.gsub("\n", '')
  end

  # escape_html
  # Escapes text for HTML; needed when converting txt content to html.
  def escape_html(text)
    CGI.escapeHTML(text)
  end

  # Wraps +content+ in a minimal HTML document skeleton.
  # parameters:
  #   +content+ markup placed inside <body>, inserted as-is
  #   +options+ optional settings
  #     :title  text placed inside <TITLE>, inserted as-is (not escaped)
  # The heredoc lines are deliberately unindented: `<<-` keeps leading
  # whitespace in the generated string.
  def wrapper_html(content, options = {})
    <<-EOS
<!DOCTYPE html>
<HTML xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
<HEAD>
<TITLE>#{options[:title]}</TITLE>
<META http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
</HEAD>
<body>
#{content}
</body>
</HTML>
    EOS
  end

  # Converts +text+ from +encoding+ (default GB2312) to UTF-8 with Iconv,
  # using the //IGNORE flag on both sides.
  # Returns nil when the conversion raises any StandardError.
  def to_utf8(text, encoding = 'GB2312')
    Iconv.iconv('UTF-8//IGNORE', "#{encoding}//IGNORE", text).join('')
  rescue StandardError
    nil
  end

  # Returns true when every line of +content+ can be stripped without an
  # error, false as soon as anything raises.
  # NOTE(review): presumably relies on String#strip raising on invalid byte
  # sequences to detect non-UTF-8 input - confirm on the target Ruby version.
  def detect_utf8(content)
    content.each_line { |line| line.strip }
    true
  rescue StandardError
    false
  end

  # scan_file_from_dir
  # Recursively collects the files below a directory as path strings.
  # parameters:
  #   +dir+     the directory to traverse
  #   +options+ optional settings
  #     :format restricts the result to one file extension, e.g.
  #             :format=>'.pdf' to collect only pdf files
  def scan_file_from_dir(dir, options = {})
    files = []
    walk_dir(dir, options) { |file| files << file.to_s }
    files
  end

  # Writes +text+ to +filename+ in binary mode, replacing existing content.
  def write_file(text, filename)
    File.open(filename, 'wb') { |file| file.write(text) }
  end

  # source_exists?
  # detect source file or directory
  # parameters:
  #   +source+   file or directory
  #   +dir_flag+ directory flag, default nil. When truthy, +source+ must be
  #              a directory; otherwise any existing path qualifies.
  def source_exists?(source, dir_flag = nil)
    # File.exists? was removed in Ruby 3.2; File.exist? works on all versions.
    dir_flag ? File.directory?(source) : File.exist?(source)
  end

  # Creates the parent directory of +destination+ (including missing
  # ancestors) unless it already exists.
  def make_destination_dir(destination)
    dest_path = File.dirname(destination)
    FileUtils.mkdir_p(dest_path) unless File.directory?(dest_path)
  end

  # Extracts keywords from a path: splits on "/" and "\" and drops
  # segments that are empty or whitespace-only.
  def extract_keywords_from_path(path)
    path.split(/[\\\/]/).reject { |key| key.strip == '' }
  end

  # Recursively yields every regular file below +path_str+ as a Pathname.
  # parameters:
  #   +path_str+ directory to traverse (String or Pathname)
  #   +options+  optional settings
  #     :format  when set, only files whose extname equals it are yielded;
  #              the filter is applied in subdirectories as well
  def walk_dir(path_str, options = {}, &block)
    format = options[:format]
    Pathname.new(path_str).children.each do |entry|
      if entry.directory?
        # Forward options so the :format filter also applies below the top level.
        walk_dir(entry, options, &block)
      elsif entry.file?
        yield entry if !format || entry.extname == format
      end
    end
  end

  # Parses +html_file+ with Nokogiri and returns one hash per <h2> element:
  # :title is the node text and :page_num the node's id attribute.
  def detect_sections_from_html(html_file)
    # File.read closes the file handle, unlike File.open(...).read.
    html = Nokogiri::HTML.parse(File.read(html_file))
    html.search('h2').map do |node|
      { :title => node.text, :page_num => node['id'] }
    end
  end
end
metadata ADDED
@@ -0,0 +1,170 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ebook_tools
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Aaron
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-04-01 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: uuid
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: iconv
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: gepub
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: poppler
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: pdf-reader
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :runtime
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: nokogiri
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :runtime
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ - !ruby/object:Gem::Dependency
111
+ name: levenshtein
112
+ requirement: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ! '>='
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
126
+ description: 电子书工具集.
127
+ email: aaron@nonobo.com
128
+ executables:
129
+ - ebook_tools
130
+ extensions: []
131
+ extra_rdoc_files: []
132
+ files:
133
+ - README
134
+ - CHANGELOG
135
+ - bin/ebook_tools
136
+ - lib/ebook_tools.rb
137
+ - lib/extract_book_struct.rb
138
+ - lib/header_detect.rb
139
+ - lib/pdf.rb
140
+ - lib/txt.rb
141
+ - lib/epub.rb
142
+ - lib/utils.rb
143
+ - ebook_tools.gemspec
144
+ homepage:
145
+ licenses: []
146
+ post_install_message:
147
+ rdoc_options:
148
+ - --charset=UTF-8
149
+ require_paths:
150
+ - lib
151
+ required_ruby_version: !ruby/object:Gem::Requirement
152
+ none: false
153
+ requirements:
154
+ - - ! '>='
155
+ - !ruby/object:Gem::Version
156
+ version: '0'
157
+ required_rubygems_version: !ruby/object:Gem::Requirement
158
+ none: false
159
+ requirements:
160
+ - - ! '>='
161
+ - !ruby/object:Gem::Version
162
+ version: '0'
163
+ requirements:
164
+ - none
165
+ rubyforge_project:
166
+ rubygems_version: 1.8.25
167
+ signing_key:
168
+ specification_version: 3
169
+ summary: 电子书工具集.
170
+ test_files: []