ebook_tools 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +2 -0
- data/README +76 -0
- data/bin/ebook_tools +196 -0
- data/ebook_tools.gemspec +38 -0
- data/lib/ebook_tools.rb +248 -0
- data/lib/epub.rb +104 -0
- data/lib/extract_book_struct.rb +415 -0
- data/lib/header_detect.rb +161 -0
- data/lib/pdf.rb +265 -0
- data/lib/txt.rb +108 -0
- data/lib/utils.rb +224 -0
- metadata +170 -0
data/lib/utils.rb
ADDED
@@ -0,0 +1,224 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'levenshtein'
|
3
|
+
require 'cgi'
|
4
|
+
require 'pathname'
|
5
|
+
require 'fileutils'
|
6
|
+
require 'iconv'
|
7
|
+
|
8
|
+
# Core extension: emptiness helpers made available on every object.
class Object
  # blank?
  # Reports whether the receiver counts as "blank".
  # * objects that respond to +empty?+ are blank when they are empty
  # * every other object is blank only when it is falsy (nil or false)
  # A whitespace-only String is therefore NOT blank under this definition.
  def blank?
    respond_to?(:empty?) ? empty? : !self
  end

  # present?
  # Exact inverse of blank?.
  def present?
    !blank?
  end
end
|
17
|
+
|
18
|
+
module Utils
|
19
|
+
extend self
|
20
|
+
|
21
|
+
# fixed_page_break
# Repairs abnormal line breaks in a text: every line is stripped, empty
# lines are dropped, and a line that is not "closed" (see line_closed?)
# is merged with the next non-empty line through merge_para_part.
# parameters:
# +page_text+ the text content; nil is treated as an empty text
# +options+ optional settings
#   :length expected line length handed to line_closed?; when omitted it
#           is guessed with guess_content_line_length
# returns the repaired lines joined with "\n"
def fixed_page_break(page_text, options = {})
  length = options[:length] || guess_content_line_length(page_text)

  lines = []
  pending = false # true while the last collected line is still open
  page_text.to_s.each_line do |raw_line|
    # strip also removes the trailing "\r\n" / "\n" terminator
    line = raw_line.strip
    next if line.empty?

    if pending
      lines[-1] = merge_para_part(lines.last, line)
    else
      lines << line
    end
    # the decision is taken on the current fragment, not on the merged text
    pending = !line_closed?(line, length)
  end
  lines.join("\n")
end
|
54
|
+
|
55
|
+
# text_similarity
# Computes how similar two texts are as
# (longest_length - levenshtein_distance) / longest_length.
# parameters:
# +text1+ first text
# +text2+ second text
# returns a Float ratio, or 0 when either text is blank or when any
# StandardError is raised while computing the distance
def text_similarity(text1, text2)
  return 0 if text1.blank? || text2.blank?

  longest = [text1.length, text2.length].max
  (longest - Levenshtein.distance(text1, text2)) / longest.to_f
rescue StandardError
  # deliberate best effort: callers get 0 instead of an exception
  0
end
|
64
|
+
|
65
|
+
# line_closed?
# Decides whether a line is the end of a paragraph line. A line is closed when:
# * it ends with a sentence end mark (see end_mark?), or
# * after removing dots, dashes and spaces it is longer than 80 characters, or
# * after that removal it is shorter than the expected line length
#   (twice that length when the line contains Han characters)
# parameters:
# +text+ the text of one line
# +length+ the expected (guessed) line length, default 60
# returns true or false
def line_closed?(text, length = 60)
  return true if end_mark?(text)

  # filler characters such as dot leaders must not count towards the length
  short_text = text.gsub(/[\.\-—. ]/, '')
  limit = (short_text =~ /\p{Han}/) ? length * 2 : length
  short_text.length > 80 || short_text.length < limit
end
|
83
|
+
|
84
|
+
# end_mark?
# Tells whether the text ends with a sentence end mark.
# parameters:
# +text+ the text to inspect; nil and "" are never terminated
# returns true or false (never nil)
def end_mark?(text)
  # NOTE(review): the previous list contained '!' and '?' twice; the
  # full-width forms are assumed to be what the duplicates stood for.
  marks = ['.', '。', '"', '!', '?', '！', '？', '…', '>']
  text.to_s.end_with?(*marks)
end
|
88
|
+
|
89
|
+
# merge_para_part
# Joins two fragments of one paragraph.
# parameters:
# +part1+ the leading fragment
# +part2+ the continuation fragment
# returns the fragments joined without a separator when +part2+ contains a
# Han character, otherwise joined with a single space
def merge_para_part(part1, part2)
  separator = (part2 =~ /\p{Han}/) ? '' : ' '
  "#{part1}#{separator}#{part2}"
end
|
96
|
+
|
97
|
+
# guess_content_line_length
# Guesses the line length of a text: the length of the LAST line that is
# shorter than 80 characters. Lengths are taken from each_line, so a line
# terminator counts as part of the line.
# parameters:
# +content+ the text content
# returns the guessed length; 0 when the content is blank or when no line
# is shorter than 80 characters
def guess_content_line_length(content)
  return 0 if !content || (content.respond_to?(:empty?) && content.empty?)

  lengths = content.each_line.map(&:length)
  # scan from the end; fall back to 0 instead of failing on nil < 80
  lengths.reverse_each.find { |len| len < 80 } || 0
end
|
110
|
+
|
111
|
+
# clean_text
# Produces a clean single-line text: surrounding whitespace is stripped and
# every remaining line break character ("\r" and "\n") is removed. Mainly
# used when a txt heading is converted to html.
# parameters:
# +text+ the text to clean; nil is returned unchanged
def clean_text(text)
  return text if text.nil?

  # delete removes each listed character, so CRLF endings leave no "\r" behind
  text.strip.delete("\r\n")
end
|
118
|
+
|
119
|
+
# escape_html
# Escapes the HTML special characters of a text; needed when txt content is
# converted to html.
# parameters:
# +text+ the text to escape; nil is treated as an empty string
# returns the escaped String
def escape_html(text)
  CGI.escapeHTML(text.to_s)
end
|
124
|
+
|
125
|
+
# wrapper_html
# Wraps an html fragment in a complete UTF-8 html document.
# parameters:
# +content+ the markup placed inside <body>; inserted as is
# +options+ optional settings
#   :title text of the <TITLE> element; inserted as is (not escaped here)
# returns the document as a String
def wrapper_html(content, options = {})
  # the closing </HTML> tag is emitted so the document is well formed
  <<-EOS
<!DOCTYPE html>
<HTML xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
<HEAD>
<TITLE>#{options[:title]}</TITLE>
<META http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
</HEAD>
<body>
#{content}
</body>
</HTML>
  EOS
end
|
138
|
+
|
139
|
+
# to_utf8
# Converts a text from the given encoding to UTF-8, dropping characters
# that cannot be converted.
# parameters:
# +text+ the text whose bytes are in +encoding+
# +encoding+ name of the source encoding, default 'GB2312'
# returns the converted String, or nil when the conversion fails
# (for example an unknown encoding name or a nil text)
def to_utf8(text, encoding = 'GB2312')
  if defined?(Iconv)
    Iconv.iconv('UTF-8//IGNORE', "#{encoding}//IGNORE", text).join('')
  else
    # Iconv is not part of the standard library since Ruby 2.0; String#encode
    # performs the same conversion when it is not loaded.
    text.encode('UTF-8', encoding, :invalid => :replace, :undef => :replace, :replace => '')
  end
rescue StandardError
  nil
end
|
146
|
+
|
147
|
+
# detect_utf8
# Checks whether the bytes of a text form valid UTF-8.
# parameters:
# +content+ the text to check; it is not modified
# returns true when the bytes are valid UTF-8, false otherwise
# (including when +content+ is nil)
def detect_utf8(content)
  # validate a copy re-tagged as UTF-8 so the answer depends on the bytes,
  # not on the encoding the string happens to be labelled with
  content.dup.force_encoding(Encoding::UTF_8).valid_encoding?
rescue StandardError
  false
end
|
153
|
+
|
154
|
+
# scan_file_from_dir
# Collects the files found while walking a directory with walk_dir.
# parameters:
# +dir+ the directory to walk
# +options+ optional settings, handed to walk_dir
#   :format file extension to select, e.g. :format=>'.pdf' to collect
#           pdf files only
# returns an Array with the path of every yielded file as a String
def scan_file_from_dir(dir, options = {})
  files = []
  walk_dir(dir, options) { |file| files << file.to_s }
  files
end
|
167
|
+
|
168
|
+
# write_file
# Writes a text to a file, replacing any previous content.
# parameters:
# +text+ the content to write
# +filename+ path of the file to write
# returns the value of File#write (the number of bytes written)
def write_file(text, filename)
  # binary mode: the bytes are written without newline translation
  File.open(filename, 'wb') { |file| file.write(text) }
end
|
173
|
+
|
174
|
+
# source_exists?
# Detects whether a source file or directory is present.
# parameters:
# +source+ path of the file or directory
# +dir_flag+ directory flag, default nil; when truthy the source must be
#            a directory, otherwise any existing path is accepted
# returns true or false
def source_exists?(source, dir_flag = nil)
  # File.exist? is used because File.exists? was removed in Ruby 3.2
  dir_flag ? File.directory?(source) : File.exist?(source)
end
|
186
|
+
|
187
|
+
# make_destination_dir
# Makes sure the directory that will contain a destination file is present,
# creating missing parent directories as needed.
# parameters:
# +destination+ path of the destination file; only its directory part
#               is created, never the file itself
def make_destination_dir(destination)
  dest_path = File.dirname(destination)
  # Dir.exist? is used because Dir.exists? was removed in Ruby 3.2
  FileUtils.mkdir_p(dest_path) unless Dir.exist?(dest_path)
end
|
191
|
+
|
192
|
+
# extract_keywords_from_path
# Extracts keywords from a path: the path is split on "/" and "\" and the
# segments that are empty or whitespace-only are discarded.
# parameters:
# +path+ the path String
# returns an Array with the remaining segments, in order and unmodified
def extract_keywords_from_path(path)
  path.split(/[\\\/]/).reject { |key| key.strip == '' }
end
|
196
|
+
|
197
|
+
# walk_dir
# Walks a directory tree and yields every regular file as a Pathname.
# parameters:
# +path_str+ the directory to walk (String or Pathname)
# +options+ optional settings
#   :format when given, only files whose extension (Pathname#extname,
#           e.g. '.pdf') equals it are yielded
# The options are handed down to sub-directories, so the :format filter
# applies at every depth.
def walk_dir(path_str, options = {}, &block)
  format = options[:format]
  Pathname.new(path_str).children.each do |entry|
    if entry.directory?
      walk_dir(entry, options, &block)
    elsif entry.file?
      yield entry if !format || entry.extname == format
    end
  end
end
|
214
|
+
|
215
|
+
# detect_sections_from_html
# Lists the sections of an html file, one per <h2> element.
# parameters:
# +html_file+ path of the html file
# returns an Array of Hashes with
#   :title    the text of the h2 element
#   :page_num the value of its id attribute
def detect_sections_from_html(html_file)
  # File.read closes the file; File.open(...).read left the handle open
  html = Nokogiri::HTML.parse(File.read(html_file))
  html.search('h2').map do |node|
    { :title => node.text, :page_num => node['id'] }
  end
end
|
223
|
+
|
224
|
+
end
|
metadata
ADDED
@@ -0,0 +1,170 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ebook_tools
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Aaron
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-04-01 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: uuid
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: iconv
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: gepub
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: poppler
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: pdf-reader
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :runtime
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: nokogiri
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
type: :runtime
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
110
|
+
- !ruby/object:Gem::Dependency
|
111
|
+
name: levenshtein
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
114
|
+
requirements:
|
115
|
+
- - ! '>='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
122
|
+
requirements:
|
123
|
+
- - ! '>='
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '0'
|
126
|
+
description: 电子书工具集.
|
127
|
+
email: aaron@nonobo.com
|
128
|
+
executables:
|
129
|
+
- ebook_tools
|
130
|
+
extensions: []
|
131
|
+
extra_rdoc_files: []
|
132
|
+
files:
|
133
|
+
- README
|
134
|
+
- CHANGELOG
|
135
|
+
- bin/ebook_tools
|
136
|
+
- lib/ebook_tools.rb
|
137
|
+
- lib/extract_book_struct.rb
|
138
|
+
- lib/header_detect.rb
|
139
|
+
- lib/pdf.rb
|
140
|
+
- lib/txt.rb
|
141
|
+
- lib/epub.rb
|
142
|
+
- lib/utils.rb
|
143
|
+
- ebook_tools.gemspec
|
144
|
+
homepage:
|
145
|
+
licenses: []
|
146
|
+
post_install_message:
|
147
|
+
rdoc_options:
|
148
|
+
- --charset=UTF-8
|
149
|
+
require_paths:
|
150
|
+
- lib
|
151
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
152
|
+
none: false
|
153
|
+
requirements:
|
154
|
+
- - ! '>='
|
155
|
+
- !ruby/object:Gem::Version
|
156
|
+
version: '0'
|
157
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
158
|
+
none: false
|
159
|
+
requirements:
|
160
|
+
- - ! '>='
|
161
|
+
- !ruby/object:Gem::Version
|
162
|
+
version: '0'
|
163
|
+
requirements:
|
164
|
+
- none
|
165
|
+
rubyforge_project:
|
166
|
+
rubygems_version: 1.8.25
|
167
|
+
signing_key:
|
168
|
+
specification_version: 3
|
169
|
+
summary: 电子书工具集.
|
170
|
+
test_files: []
|