ebook_tools 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,224 @@
1
+ # encoding: UTF-8
2
+ require 'levenshtein'
3
+ require 'cgi'
4
+ require 'pathname'
5
+ require 'fileutils'
6
+ require 'iconv'
7
+
8
# Core extension giving every object ActiveSupport-like emptiness predicates.
class Object
  # True when the object counts as "blank": an object that responds to
  # +empty?+ is blank when it is empty; any other object is blank only when it
  # is falsy (+nil+ or +false+).
  def blank?
    return !self unless respond_to?(:empty?)

    empty?
  end

  # Logical opposite of #blank?.
  def present?
    blank? ? false : true
  end
end
17
+
18
# General-purpose helpers for the ebook tools: re-flowing extracted text into
# paragraphs, text similarity, HTML escaping/wrapping, encoding conversion and
# file-system traversal.
#
# The module extends itself, so every method can be called directly on the
# module (e.g. +Utils.clean_text(text)+) as well as through +include Utils+.
module Utils
  extend self

  # Characters treated as the end of a sentence (and therefore of a line).
  # NOTE(review): '!' and '?' are listed twice; full-width variants may have
  # been intended for the second pair -- confirm against the upstream source.
  END_MARKS = [".","。",'"','!','?','!','?','…','>'].freeze

  # Repairs abnormal line breaks in a page of text by gluing a line onto the
  # previous one whenever the previous line does not look finished
  # (see #line_closed?). Blank lines are dropped.
  #
  # parameters:
  # +page_text+ the text to repair (+nil+ yields an empty string)
  # +options+   optional settings
  #   :length   expected line length; guessed from the text when omitted
  #
  # Returns the repaired lines joined with "\n".
  def fixed_page_break(page_text, options = {})
    return '' if page_text.nil?

    length = options[:length] || guess_content_line_length(page_text)
    lines = []
    continuing = false # true while the previous line is still "open"

    page_text.each_line do |raw|
      # strip also removes the trailing "\n" / "\r\n" of each line.
      line = raw.strip
      next if line.empty?

      if continuing
        lines[-1] = merge_para_part(lines.last, line)
      else
        lines << line
      end
      # Openness is judged on the fragment just read, not on the merged line.
      continuing = !line_closed?(line, length)
    end

    lines.join("\n")
  end

  # Computes the similarity of two texts as
  # (longest length - Levenshtein distance) / longest length.
  #
  # Returns 0 when either text is nil or empty, and also when anything goes
  # wrong while computing the distance (best-effort by design).
  def text_similarity(text1, text2)
    return 0 if text1.nil? || text1.empty? || text2.nil? || text2.empty?

    distance = Levenshtein.distance(text1, text2)
    longest = [text1.length, text2.length].max
    (longest - distance) / longest.to_f
  rescue
    0
  end

  # Decides whether +text+ is a complete line. A line is complete when
  # * it ends with a sentence end mark (see #end_mark?), or
  # * after removing dots, dashes and spaces it is longer than 80 characters, or
  # * after that removal it is shorter than the expected line length
  #   (twice the expected length when the line contains Han characters).
  #
  # parameters:
  # +text+   the content of one line
  # +length+ the expected (guessed) line length, 60 by default
  def line_closed?(text, length = 60)
    return true if end_mark?(text)

    short_text = text.gsub(/[\.\-—. ]/, '')
    limit = (short_text =~ /\p{Han}/) ? length * 2 : length
    short_text.length > 80 || short_text.length < limit
  end

  # Returns true when the last character of +text+ is one of END_MARKS,
  # false otherwise (including for an empty string).
  def end_mark?(text)
    END_MARKS.include?(text[-1])
  end

  # Joins two fragments of the same paragraph. When the second fragment
  # contains Han characters the parts are concatenated directly; otherwise a
  # single space is inserted between them.
  def merge_para_part(part1, part2)
    separator = (part2 =~ /\p{Han}/) ? "" : " "
    [part1, part2].join(separator)
  end

  # Guesses the line length of +content+: scanning from the end, returns the
  # length (trailing newline included) of the first line shorter than 80
  # characters.
  #
  # Returns 0 for nil/empty content and when no line is shorter than 80
  # characters.
  def guess_content_line_length(content)
    return 0 if content.nil? || content.empty?

    lengths = content.each_line.map { |line| line.length }
    lengths.reverse.find { |len| len < 80 } || 0
  end

  # Returns +text+ with surrounding whitespace and every newline removed
  # (+nil+ is returned unchanged). Mainly used when converting txt titles
  # to HTML.
  def clean_text(text)
    return text if text.nil?

    text.strip.gsub("\n", '')
  end

  # HTML-escapes +text+; needed when converting plain text to HTML.
  def escape_html(text)
    CGI.escapeHTML(text)
  end

  # Wraps +content+ in a minimal HTML document.
  #
  # parameters:
  # +content+ markup placed inside <body>; inserted verbatim (not escaped)
  # +options+ optional settings
  #   :title  text placed inside <TITLE>; inserted verbatim (not escaped)
  def wrapper_html(content, options = {})
    [
      '<!DOCTYPE html>',
      '<HTML xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">',
      '<HEAD>',
      "<TITLE>#{options[:title]}</TITLE>",
      '<META http-equiv="Content-Type" content="text/html; charset=UTF-8"/>',
      '</HEAD>',
      '<body>',
      content.to_s,
      '</body>',
      '</HTML>' # close the root element so the document is well-formed
    ].join("\n") + "\n"
  end

  # Converts +text+ from +encoding+ (GB2312 by default) to UTF-8 with Iconv,
  # ignoring characters that cannot be converted.
  #
  # Returns the converted string, or nil when the conversion fails.
  def to_utf8(text, encoding = 'GB2312')
    Iconv.iconv('UTF-8//IGNORE', "#{encoding}//IGNORE", text).join("")
  rescue
    nil
  end

  # Returns true when the bytes of +content+ form valid UTF-8, false
  # otherwise (including when +content+ is not a string).
  def detect_utf8(content)
    # Check a relabelled copy so the caller's string is left untouched.
    content.dup.force_encoding(Encoding::UTF_8).valid_encoding?
  rescue
    false
  end

  # Recursively collects the files below a directory.
  #
  # parameters:
  # +dir+     the directory to traverse
  # +options+ optional settings
  #   :format only collect files with this extension, e.g. :format=>'.pdf'
  #
  # Returns an array of path strings.
  def scan_file_from_dir(dir, options = {})
    files = []
    walk_dir(dir, options) { |file| files << file.to_s }
    files
  end

  # Writes +text+ to +filename+ in binary mode, replacing existing content.
  def write_file(text, filename)
    File.open(filename, 'wb') { |file| file.write(text) }
  end

  # Checks whether a source file or directory exists.
  #
  # parameters:
  # +source+   file or directory path
  # +dir_flag+ when truthy, +source+ must be a directory; default nil
  def source_exists?(source, dir_flag = nil)
    dir_flag ? File.directory?(source) : File.exist?(source)
  end

  # Creates the parent directory of +destination+ (with any missing
  # ancestors) unless it already exists.
  def make_destination_dir(destination)
    dest_path = File.dirname(destination)
    FileUtils.mkdir_p(dest_path) unless File.directory?(dest_path)
  end

  # Extracts keywords from a path: splits on "/" and "\" and drops the
  # segments that are empty or contain only whitespace.
  def extract_keywords_from_path(path)
    path.split(/[\\\/]/).reject { |key| key.strip == '' }
  end

  # Recursively walks +path_str+ and yields a Pathname for every regular
  # file found.
  #
  # parameters:
  # +path_str+ directory to walk (String or Pathname)
  # +options+  optional settings
  #   :format  only yield files whose extension equals this value
  def walk_dir(path_str, options = {}, &block)
    format = options[:format]
    Pathname.new(path_str).children.each do |entry|
      if entry.directory?
        # Forward the options so the :format filter also applies below.
        walk_dir(entry, options, &block)
      elsif entry.file?
        yield entry if !format || entry.extname == format
      end
    end
  end

  # Parses +html_file+ with Nokogiri and returns one hash per <h2> element:
  # :title is the element's text and :page_num its +id+ attribute.
  def detect_sections_from_html(html_file)
    html = Nokogiri::HTML.parse(File.read(html_file))
    html.search('h2').map do |node|
      { :title => node.text, :page_num => node['id'] }
    end
  end
end
metadata ADDED
@@ -0,0 +1,170 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ebook_tools
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Aaron
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-04-01 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: uuid
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: iconv
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: gepub
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: poppler
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: pdf-reader
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :runtime
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: nokogiri
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :runtime
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ - !ruby/object:Gem::Dependency
111
+ name: levenshtein
112
+ requirement: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ! '>='
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
126
+ description: 电子书工具集.
127
+ email: aaron@nonobo.com
128
+ executables:
129
+ - ebook_tools
130
+ extensions: []
131
+ extra_rdoc_files: []
132
+ files:
133
+ - README
134
+ - CHANGELOG
135
+ - bin/ebook_tools
136
+ - lib/ebook_tools.rb
137
+ - lib/extract_book_struct.rb
138
+ - lib/header_detect.rb
139
+ - lib/pdf.rb
140
+ - lib/txt.rb
141
+ - lib/epub.rb
142
+ - lib/utils.rb
143
+ - ebook_tools.gemspec
144
+ homepage:
145
+ licenses: []
146
+ post_install_message:
147
+ rdoc_options:
148
+ - --charset=UTF-8
149
+ require_paths:
150
+ - lib
151
+ required_ruby_version: !ruby/object:Gem::Requirement
152
+ none: false
153
+ requirements:
154
+ - - ! '>='
155
+ - !ruby/object:Gem::Version
156
+ version: '0'
157
+ required_rubygems_version: !ruby/object:Gem::Requirement
158
+ none: false
159
+ requirements:
160
+ - - ! '>='
161
+ - !ruby/object:Gem::Version
162
+ version: '0'
163
+ requirements:
164
+ - none
165
+ rubyforge_project:
166
+ rubygems_version: 1.8.25
167
+ signing_key:
168
+ specification_version: 3
169
+ summary: 电子书工具集.
170
+ test_files: []