ebook_tools 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +2 -0
- data/README +76 -0
- data/bin/ebook_tools +196 -0
- data/ebook_tools.gemspec +38 -0
- data/lib/ebook_tools.rb +248 -0
- data/lib/epub.rb +104 -0
- data/lib/extract_book_struct.rb +415 -0
- data/lib/header_detect.rb +161 -0
- data/lib/pdf.rb +265 -0
- data/lib/txt.rb +108 -0
- data/lib/utils.rb +224 -0
- metadata +170 -0
data/lib/utils.rb
ADDED
@@ -0,0 +1,224 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'levenshtein'
|
3
|
+
require 'cgi'
|
4
|
+
require 'pathname'
|
5
|
+
require 'fileutils'
|
6
|
+
require 'iconv'
|
7
|
+
|
8
|
+
# Minimal blank?/present? helpers for every object.
#
# Each method is only defined when no implementation is already present, so
# loading this file after another library that provides the same helpers
# does not replace that library's versions.
class Object
  unless method_defined?(:blank?)
    # Returns the result of +empty?+ when the receiver responds to it
    # (strings, arrays, hashes, ...); otherwise returns true only for
    # falsy receivers (+nil+ and +false+).
    def blank?
      respond_to?(:empty?) ? empty? : !self
    end
  end

  unless method_defined?(:present?)
    # Logical negation of +blank?+.
    def present?
      !blank?
    end
  end
end
|
17
|
+
|
18
|
+
# Text, encoding and filesystem helpers shared by the ebook tools.
# All methods are callable directly on the module (Utils.clean_text(...)).
module Utils
  extend self

  # Characters that mark the end of a sentence when they are the last
  # character of a line. The full-width '!' and '?' are listed next to their
  # ASCII forms so that CJK sentences are recognised as terminated too.
  END_MARKS = [".", "。", '"', '!', '?', '!', '?', '…', '>'].freeze

  # Characters removed from a line before its length is measured.
  FILLER_CHARS = /[\.\-—. ]/

  # Lines longer than this are never treated as wrapped fragments.
  MAX_LINE_LENGTH = 80

  # fixed_page_break
  # Repairs paragraphs that were broken across several physical lines.
  # Each line is stripped, empty lines are dropped, and a line that follows
  # an "unclosed" line (see line_closed?) is merged into it.
  # parameters:
  #   +page_text+ the text to repair; nil yields an empty string
  #   +options+   :length overrides the guessed typical line length
  # Returns the repaired text with paragraphs separated by "\n".
  def fixed_page_break(page_text, options = {})
    return "" if page_text.nil?

    length = options[:length] || guess_content_line_length(page_text)

    lines = []
    pending = false # true while the previous non-empty line is still open
    page_text.each_line do |raw|
      line = raw.strip
      next if line.empty?

      if pending
        lines[-1] = merge_para_part(lines.last, line)
      else
        lines << line
      end
      # Closure is decided on the fragment just read, not on the merged line.
      pending = !line_closed?(line, length)
    end
    lines.join("\n")
  end

  # Computes the similarity of two texts as
  # (longest_length - levenshtein_distance) / longest_length.
  # Returns 0 when either text is nil/empty or when the computation raises.
  def text_similarity(text1, text2)
    return 0 if text1.nil? || text1.empty? || text2.nil? || text2.empty?

    longest = [text1.length, text2.length].max
    (longest - Levenshtein.distance(text1, text2)) / longest.to_f
  rescue StandardError
    # Best effort: any failure is reported as "no similarity".
    0
  end

  # line_closed?
  # Decides whether a line is a complete line (true) or a fragment whose
  # continuation is on the next line (false). A line is closed when:
  #   * it ends with a sentence-ending mark, or
  #   * after removing filler characters it is longer than 80 characters, or
  #   * after removing filler characters it is shorter than +length+
  #     (+length+ * 2 when the line contains Han characters).
  # parameters:
  #   +text+   the content of one line
  #   +length+ the typical line length of the document (default 60)
  def line_closed?(text, length = 60)
    return true if end_mark?(text)

    short_text = text.gsub(FILLER_CHARS, '')
    return true if short_text.length > MAX_LINE_LENGTH

    threshold = (short_text =~ /\p{Han}/) ? length * 2 : length
    short_text.length < threshold
  end

  # Returns true when the last character of +text+ is one of END_MARKS,
  # false otherwise (including for an empty string).
  def end_mark?(text)
    END_MARKS.include?(text[-1])
  end

  # Joins two fragments of one paragraph. When +part2+ contains Han
  # characters the parts are concatenated directly; otherwise a single
  # space is inserted between them.
  def merge_para_part(part1, part2)
    separator = (part2 =~ /\p{Han}/) ? "" : " "
    [part1, part2].join(separator)
  end

  # Guesses the typical line length of +content+: the length (line
  # terminator included) of the last line shorter than 80 characters.
  # Returns 0 for nil/empty content and when no line is shorter than 80
  # characters (previously that case raised NoMethodError on nil).
  def guess_content_line_length(content)
    return 0 if content.nil? || content.empty?

    lengths = content.each_line.map { |line| line.length }
    lengths.reverse.find { |len| len < MAX_LINE_LENGTH } || 0
  end

  # clean_text
  # Returns +text+ with surrounding whitespace and every line break
  # ("\n" or "\r\n") removed; mainly used when converting txt headings to
  # html. nil is returned unchanged.
  def clean_text(text)
    return text if text.nil?

    text.strip.gsub(/\r?\n/, '')
  end

  # escape_html
  # Escapes HTML special characters; needed when converting txt to html.
  def escape_html(text)
    CGI.escapeHTML(text)
  end

  # Wraps +content+ in a fixed HTML skeleton. options[:title] is
  # interpolated into the TITLE element; neither value is escaped here.
  # NOTE(review): the template has no closing </HTML> tag; it is left as is
  # because callers outside this file may rely on the exact output.
  def wrapper_html(content, options = {})
    <<-EOS
<!DOCTYPE html>
<HTML xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
<HEAD>
<TITLE>#{options[:title]}</TITLE>
<META http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
</HEAD>
<body>
#{content}
</body>
    EOS
  end

  # Converts +text+ from +encoding+ (default GB2312) to UTF-8, dropping
  # byte sequences that cannot be converted. Iconv is used when it is
  # loaded; otherwise String#encode performs the conversion.
  # Returns nil when the conversion raises.
  def to_utf8(text, encoding = 'GB2312')
    if defined?(Iconv)
      Iconv.iconv('UTF-8//IGNORE', "#{encoding}//IGNORE", text).join("")
    else
      text.encode('UTF-8', encoding,
                  :invalid => :replace, :undef => :replace, :replace => '')
    end
  rescue StandardError
    nil
  end

  # Returns true when +content+ is a string whose bytes are valid in the
  # encoding it is tagged with, false otherwise (including for nil).
  # NOTE(review): presumably callers pass strings tagged as UTF-8 - confirm.
  def detect_utf8(content)
    content.respond_to?(:valid_encoding?) && content.valid_encoding?
  end

  # scan_file_from_dir
  # Recursively collects the files below a directory.
  # parameters:
  #   +dir+     the directory to traverse
  #   +options+ optional parameters
  #     :format  only return files with this extension, e.g.
  #              :format=>'.pdf' returns all pdf files
  # Returns an array of path strings.
  def scan_file_from_dir(dir, options = {})
    files = []
    walk_dir(dir, options) { |file| files << file.to_s }
    files
  end

  # Writes +text+ to +filename+ in binary mode, replacing existing content.
  def write_file(text, filename)
    File.open(filename, 'wb') do |file|
      file.write text
    end
  end

  # source_exists?
  # detect source file or directory
  # parameters:
  #   +source+   file or directory
  #   +dir_flag+ directory flag, default nil. When truthy, +source+ must be
  #              a directory; otherwise any existing path is accepted.
  def source_exists?(source, dir_flag = nil)
    dir_flag ? File.directory?(source) : File.exist?(source)
  end

  # Creates the parent directory of +destination+ (with all missing
  # ancestors) unless it already exists.
  def make_destination_dir(destination)
    dest_path = File.dirname(destination)
    FileUtils.mkdir_p(dest_path) unless File.directory?(dest_path)
  end

  # Extracts keywords from a path: its components split on "/" or "\",
  # with empty and whitespace-only components removed.
  def extract_keywords_from_path(path)
    path.split(/[\\\/]/).reject { |key| key.strip.empty? }
  end

  # Recursively yields every regular file below +path_str+ as a Pathname.
  # When options[:format] is set, only files whose extension equals it are
  # yielded; the filter now also applies inside subdirectories.
  def walk_dir(path_str, options = {}, &block)
    format = options[:format]
    Pathname.new(path_str).children.each do |entry|
      if entry.directory?
        walk_dir(entry, options, &block)
      elsif entry.file?
        yield entry if !format || entry.extname == format
      end
    end
  end

  # Parses +html_file+ and returns one hash per h2 element, with the
  # element text under :title and its id attribute under :page_num.
  # Nokogiri must already be loaded by the caller.
  def detect_sections_from_html(html_file)
    html = Nokogiri::HTML.parse(File.read(html_file))
    html.search('h2').map do |node|
      { :title => node.text, :page_num => node['id'] }
    end
  end

end
|
metadata
ADDED
@@ -0,0 +1,170 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ebook_tools
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Aaron
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-04-01 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: uuid
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: iconv
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: gepub
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: poppler
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: pdf-reader
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :runtime
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: nokogiri
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
type: :runtime
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
110
|
+
- !ruby/object:Gem::Dependency
|
111
|
+
name: levenshtein
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
114
|
+
requirements:
|
115
|
+
- - ! '>='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
122
|
+
requirements:
|
123
|
+
- - ! '>='
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '0'
|
126
|
+
description: 电子书工具集.
|
127
|
+
email: aaron@nonobo.com
|
128
|
+
executables:
|
129
|
+
- ebook_tools
|
130
|
+
extensions: []
|
131
|
+
extra_rdoc_files: []
|
132
|
+
files:
|
133
|
+
- README
|
134
|
+
- CHANGELOG
|
135
|
+
- bin/ebook_tools
|
136
|
+
- lib/ebook_tools.rb
|
137
|
+
- lib/extract_book_struct.rb
|
138
|
+
- lib/header_detect.rb
|
139
|
+
- lib/pdf.rb
|
140
|
+
- lib/txt.rb
|
141
|
+
- lib/epub.rb
|
142
|
+
- lib/utils.rb
|
143
|
+
- ebook_tools.gemspec
|
144
|
+
homepage:
|
145
|
+
licenses: []
|
146
|
+
post_install_message:
|
147
|
+
rdoc_options:
|
148
|
+
- --charset=UTF-8
|
149
|
+
require_paths:
|
150
|
+
- lib
|
151
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
152
|
+
none: false
|
153
|
+
requirements:
|
154
|
+
- - ! '>='
|
155
|
+
- !ruby/object:Gem::Version
|
156
|
+
version: '0'
|
157
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
158
|
+
none: false
|
159
|
+
requirements:
|
160
|
+
- - ! '>='
|
161
|
+
- !ruby/object:Gem::Version
|
162
|
+
version: '0'
|
163
|
+
requirements:
|
164
|
+
- none
|
165
|
+
rubyforge_project:
|
166
|
+
rubygems_version: 1.8.25
|
167
|
+
signing_key:
|
168
|
+
specification_version: 3
|
169
|
+
summary: 电子书工具集.
|
170
|
+
test_files: []
|