ebook_tools 0.0.6 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +3 -0
- data/ebook_tools.gemspec +2 -2
- data/lib/ebook_tools.rb +45 -11
- data/lib/{extract_book_struct.rb → txt_book.rb} +94 -154
- data/lib/utils.rb +28 -8
- metadata +2 -2
data/CHANGELOG
CHANGED
data/ebook_tools.gemspec
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{ebook_tools}
|
5
|
-
s.version = '0.0.6'
|
5
|
+
s.version = '0.1.0'
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Aaron"]
|
@@ -25,7 +25,7 @@ Gem::Specification.new do |s|
|
|
25
25
|
"bin/doc_book_import_mongo",
|
26
26
|
"bin/xml2json",
|
27
27
|
"lib/ebook_tools.rb",
|
28
|
-
"lib/extract_book_struct.rb",
|
28
|
+
"lib/txt_book.rb",
|
29
29
|
"lib/header_detect.rb",
|
30
30
|
"lib/pdf.rb",
|
31
31
|
"lib/txt.rb",
|
data/lib/ebook_tools.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# encoding: UTF-8
|
3
|
-
['utils','epub','txt','pdf','header_detect','extract_book_struct'].each do |file|
|
3
|
+
['utils','epub','txt','pdf','header_detect','txt_book'].each do |file|
|
4
4
|
require File.join(File.dirname(__FILE__),file)
|
5
5
|
end
|
6
6
|
|
@@ -201,16 +201,27 @@ module EbookTools
|
|
201
201
|
end
|
202
202
|
|
203
203
|
def extract_book_struct_to_file(source,destination,options={})
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
204
|
+
options[:title] ||= File.basename(source,File.extname(source))
|
205
|
+
content = case File.extname(source)
|
206
|
+
when '.html'
|
207
|
+
extract_text_from_file(source,'.html')
|
208
|
+
when '.epub'
|
209
|
+
text = extract_text_from_file(source,'.epub')
|
210
|
+
sanitize_for_epub_text(text)
|
211
|
+
when '.txt'
|
212
|
+
File.open(source).read
|
213
|
+
end
|
214
|
+
txt_book = TxtBook.new(content,options)
|
215
|
+
docbook_xml = txt_book.to_doc_book
|
216
|
+
if docbook_xml
|
217
|
+
FileUtils.mkdir_p(File.dirname(destination)) unless Dir.exists?(File.dirname(destination))
|
218
|
+
File.open(destination,'wb'){|file|file.write docbook_xml}
|
219
|
+
puts "目录结构:"
|
220
|
+
puts txt_book.toc_to_text
|
221
|
+
puts "共修复#{txt_book.breaklines_count}个断点."
|
222
|
+
return true
|
223
|
+
else
|
224
|
+
return nil
|
214
225
|
end
|
215
226
|
end
|
216
227
|
|
@@ -254,4 +265,27 @@ module EbookTools
|
|
254
265
|
content = Utils.fixed_page_break(content,options)
|
255
266
|
File.open(target_file,'w'){|file| file.write content}
|
256
267
|
end
|
268
|
+
|
269
|
+
def extract_text_from_file(filename,format)
|
270
|
+
txt_file = File.basename(filename,format)
|
271
|
+
cmd = %Q(ebook-convert #{filename} #{txt_file}.txt)
|
272
|
+
output = `#{cmd}`
|
273
|
+
content = File.open("#{txt_file}.txt").read
|
274
|
+
FileUtils.remove_file("#{txt_file}.txt",true)
|
275
|
+
return content
|
276
|
+
end
|
277
|
+
|
278
|
+
# sanitize_for_epub_text
|
279
|
+
def sanitize_for_epub_text(content)
|
280
|
+
return content if content.blank?
|
281
|
+
lines = []
|
282
|
+
content.each_line do |line|
|
283
|
+
unless line.downcase.include?('document outline')
|
284
|
+
lines << line
|
285
|
+
else
|
286
|
+
break;
|
287
|
+
end
|
288
|
+
end
|
289
|
+
lines.join("")
|
290
|
+
end
|
257
291
|
end
|
@@ -1,7 +1,10 @@
|
|
1
|
-
# encoding:
|
2
|
-
|
3
|
-
|
4
|
-
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'uuid'
|
3
|
+
require 'cgi'
|
4
|
+
|
5
|
+
#=文本书籍
|
6
|
+
# 处理TXT格式的书籍。
|
7
|
+
#
|
5
8
|
# 1. 文档的编码格式必须是UTF-8或GB2312,推荐使用UTF-8格式
|
6
9
|
# 2. 文档的内容只包含书内容部分(书名、作者、目录等信息应该不包含在文档内)
|
7
10
|
# 3. 文档的段落应该完整(有些PDF转换过来的文档会破坏句子,需要进行预处理)
|
@@ -9,145 +12,83 @@
|
|
9
12
|
# 5. 文档需要包含结构信息(例如: 卷、篇、部分、章(回)节或者有连续的序号)
|
10
13
|
# 6. 每个结构信息都应该独立成行。
|
11
14
|
#
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
# === ExtractBookStruct.from_txt
|
16
|
-
# 从文本文件中提取目录结构
|
17
|
-
#
|
18
|
-
# === ExtractBookStruct.from_epub
|
19
|
-
# 从EPUB文件中提取目录结构
|
20
|
-
#
|
21
|
-
# === ExtractBookStruct.from_html
|
22
|
-
# 从HTML中提取目录结构
|
15
|
+
class TxtBook
|
16
|
+
include HeaderDetect
|
17
|
+
attr_reader :title,:author,:publisher,:pubdate,:isbn,:content
|
23
18
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
module ExtractBookStruct
|
29
|
-
extend self
|
30
|
-
extend HeaderDetect
|
31
|
-
|
32
|
-
def from_txt(filename,options={})
|
33
|
-
options[:title] ||= File.basename(filename,File.extname(filename))
|
19
|
+
def self.load(filename,options={})
|
20
|
+
raise '无效的文件' unless File.exists?(filename)
|
21
|
+
options[:title] = File.basename(filename, File.extname(filename))
|
34
22
|
content = File.open(filename).read
|
35
|
-
|
36
|
-
end
|
37
|
-
|
38
|
-
def from_html(filename,options={})
|
39
|
-
options[:title] ||= File.basename(filename,File.extname(filename))
|
40
|
-
content = extract_text_from_file(filename,'.html')
|
41
|
-
extract_book_struct(content,options)
|
42
|
-
end
|
43
|
-
|
44
|
-
def from_epub(filename,options={})
|
45
|
-
options[:title] ||= File.basename(filename,File.extname(filename))
|
46
|
-
content = extract_text_from_file(filename,'.epub')
|
47
|
-
extract_book_struct(content,options)
|
23
|
+
new(content,options)
|
48
24
|
end
|
25
|
+
|
26
|
+
def initialize(content,options={})
|
27
|
+
@title = options[:title]
|
28
|
+
@author = options[:author]
|
29
|
+
@publisher = options[:publisher]
|
30
|
+
@pubdate= options[:pubdate]
|
31
|
+
@isbn = options[:isbn]
|
32
|
+
@format = options[:format]
|
49
33
|
|
50
|
-
def extract_book_struct(content,options={})
|
51
34
|
unless Utils.detect_utf8(content)
|
52
35
|
content = Utils.to_utf8(content)
|
53
36
|
end
|
54
|
-
content =
|
55
|
-
paras = extract_paras(content)
|
56
|
-
# 检查书类型(text,digital,hybrid)
|
57
|
-
format = options[:format] || detect_struct_type(paras)
|
58
|
-
case format
|
59
|
-
when :text
|
60
|
-
extract_text_book_struct(paras,options)
|
61
|
-
when :digital
|
62
|
-
extract_digital_book_struct(paras,options)
|
63
|
-
when :hybrid
|
64
|
-
extract_hybrid_book_struct(paras,options)
|
65
|
-
else
|
66
|
-
return nil
|
67
|
-
end
|
37
|
+
@content = content
|
68
38
|
end
|
69
39
|
|
70
|
-
def
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
40
|
+
def struct_content
|
41
|
+
return @struct_content if @struct_content
|
42
|
+
content = if breaklines_count > 100
|
43
|
+
Utils.fixed_page_break(@content)
|
44
|
+
else
|
45
|
+
@content
|
46
|
+
end
|
47
|
+
@struct_content = extract_book_struct(content,:format=>@format)
|
77
48
|
end
|
78
49
|
|
79
|
-
def extract_paras(content)
|
80
|
-
|
81
|
-
return paras if content.blank?
|
82
|
-
content.each_line do |line|
|
83
|
-
text = Utils.clean_text(line)
|
84
|
-
paras << text if text.length > 0
|
85
|
-
end
|
86
|
-
paras
|
50
|
+
def breaklines
|
51
|
+
@breaklines ||= Utils.breaklines(content)
|
87
52
|
end
|
88
53
|
|
89
|
-
def detect_struct_type(paras)
|
90
|
-
|
91
|
-
digital_flag = false
|
92
|
-
paras.each do |para|
|
93
|
-
if guess_volume?(para) || guess_part?(para) || guess_chapter?(para) || guess_section?(para)
|
94
|
-
text_flag = true
|
95
|
-
end
|
96
|
-
|
97
|
-
if guess_digital_header?(para)
|
98
|
-
digital_flag = true
|
99
|
-
end
|
100
|
-
end
|
101
|
-
|
102
|
-
if text_flag && digital_flag
|
103
|
-
:hybrid
|
104
|
-
elsif text_flag
|
105
|
-
:text
|
106
|
-
elsif digital_flag
|
107
|
-
:digital
|
108
|
-
else
|
109
|
-
:unknown
|
110
|
-
end
|
54
|
+
def breaklines_count
|
55
|
+
breaklines.count
|
111
56
|
end
|
112
57
|
|
113
|
-
|
114
|
-
|
115
|
-
# 标注结构信息
|
116
|
-
marked_content = mark_struct_info(content)
|
117
|
-
# 构建书结构
|
118
|
-
struct = build_struct(marked_content)
|
119
|
-
# 修正结构
|
120
|
-
revised_struct = revise_struct(struct)
|
121
|
-
# 生成docbook
|
122
|
-
build_doc_book(revised_struct,options)
|
58
|
+
def toc
|
59
|
+
@toc ||= extract_toc_from_struct(struct_content)
|
123
60
|
end
|
124
61
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
# 构建书结构
|
130
|
-
struct = build_struct(marked_content)
|
131
|
-
|
132
|
-
# 修正结构
|
133
|
-
revised_struct = revise_struct(struct)
|
134
|
-
|
135
|
-
# 生成docbook
|
136
|
-
build_doc_book(revised_struct,options)
|
62
|
+
def toc_to_text
|
63
|
+
gen_toc(toc) do |item,children|
|
64
|
+
"#{item[:title]}\n#{children}"
|
65
|
+
end
|
137
66
|
end
|
138
67
|
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
struct = build_struct(marked_content)
|
145
|
-
|
146
|
-
# 修正结构
|
147
|
-
revised_struct = revise_struct(struct)
|
68
|
+
def to_doc_book
|
69
|
+
if struct_content
|
70
|
+
build_doc_book(struct_content,{:title=>title,:publisher=>publisher,:pubdate=>pubdate,:author=>author,:isbn=>isbn})
|
71
|
+
end
|
72
|
+
end
|
148
73
|
|
149
|
-
|
150
|
-
|
74
|
+
private
|
75
|
+
def extract_book_struct(content,options={})
|
76
|
+
paras = extract_paras(content)
|
77
|
+
# 检查书类型(text,digital,hybrid)
|
78
|
+
format = options[:format] || detect_struct_type(paras)
|
79
|
+
marked_content = case format
|
80
|
+
when :text
|
81
|
+
mark_struct_info(paras)
|
82
|
+
when :digital
|
83
|
+
mark_digital_struct_info(paras)
|
84
|
+
when :hybrid
|
85
|
+
mark_hybrid_struct_info(paras)
|
86
|
+
else
|
87
|
+
return nil
|
88
|
+
end
|
89
|
+
if marked_content
|
90
|
+
build_struct(marked_content)
|
91
|
+
end
|
151
92
|
end
|
152
93
|
|
153
94
|
# 标注结构信息
|
@@ -207,14 +148,7 @@ module ExtractBookStruct
|
|
207
148
|
marked_content
|
208
149
|
end
|
209
150
|
|
210
|
-
# 修正结构 TODO
|
211
|
-
def revise_struct(struct)
|
212
|
-
struct
|
213
|
-
end
|
214
|
-
|
215
151
|
def build_doc_book(struct,options={})
|
216
|
-
toc = extract_toc_from_struct(struct)
|
217
|
-
|
218
152
|
doc_toc = gen_docbook_toc(toc)
|
219
153
|
|
220
154
|
struct = struct.map{|item| item if item.is_a?(Hash)}.compact
|
@@ -380,20 +314,6 @@ EOS
|
|
380
314
|
end
|
381
315
|
end
|
382
316
|
|
383
|
-
=begin
|
384
|
-
def gen_docbook_tocdiv(toc)
|
385
|
-
doc_toc = []
|
386
|
-
toc.each do |item|
|
387
|
-
children = ""
|
388
|
-
if item[:children].any?
|
389
|
-
children = gen_docbook_tocdiv(item[:children])
|
390
|
-
end
|
391
|
-
doc_toc << "<tocdiv><title>#{item[:title]}</title>#{children}</tocdiv>"
|
392
|
-
end
|
393
|
-
doc_toc.join("")
|
394
|
-
end
|
395
|
-
=end
|
396
|
-
|
397
317
|
def gen_docbook_content(struct)
|
398
318
|
content = []
|
399
319
|
struct.each do |item|
|
@@ -420,17 +340,37 @@ EOS
|
|
420
340
|
content.join("\n")
|
421
341
|
end
|
422
342
|
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
lines << line
|
430
|
-
else
|
431
|
-
break;
|
343
|
+
def detect_struct_type(paras)
|
344
|
+
text_flag = false
|
345
|
+
digital_flag = false
|
346
|
+
paras.each do |para|
|
347
|
+
if guess_volume?(para) || guess_part?(para) || guess_chapter?(para) || guess_section?(para)
|
348
|
+
text_flag = true
|
432
349
|
end
|
350
|
+
|
351
|
+
if guess_digital_header?(para)
|
352
|
+
digital_flag = true
|
353
|
+
end
|
354
|
+
end
|
355
|
+
|
356
|
+
if text_flag && digital_flag
|
357
|
+
:hybrid
|
358
|
+
elsif text_flag
|
359
|
+
:text
|
360
|
+
elsif digital_flag
|
361
|
+
:digital
|
362
|
+
else
|
363
|
+
:unknown
|
433
364
|
end
|
434
|
-
|
365
|
+
end
|
366
|
+
|
367
|
+
def extract_paras(content)
|
368
|
+
paras = []
|
369
|
+
return paras if content.blank?
|
370
|
+
content.each_line do |line|
|
371
|
+
text = Utils.clean_text(line)
|
372
|
+
paras << text if text.length > 0
|
373
|
+
end
|
374
|
+
paras
|
435
375
|
end
|
436
376
|
end
|
data/lib/utils.rb
CHANGED
@@ -23,15 +23,8 @@ module Utils
|
|
23
23
|
# parameters:
|
24
24
|
# +page_text+ 文本内容
|
25
25
|
def fixed_page_break(page_text,options={})
|
26
|
-
page_lines = []
|
27
26
|
length = options[:length] || guess_content_line_length(page_text)
|
28
|
-
|
29
|
-
page_text.each_line do |line|
|
30
|
-
line.gsub!("\r\n","")
|
31
|
-
line.gsub!("\n","")
|
32
|
-
line.strip!
|
33
|
-
page_lines << line
|
34
|
-
end
|
27
|
+
page_lines = text_to_array(page_text)
|
35
28
|
|
36
29
|
lines = []
|
37
30
|
flag_tag = false
|
@@ -52,6 +45,32 @@ module Utils
|
|
52
45
|
lines.join("\n")
|
53
46
|
end
|
54
47
|
|
48
|
+
def breaklines(text,options={})
|
49
|
+
break_lines = []
|
50
|
+
lines = text_to_array(text)
|
51
|
+
length = options[:length] || guess_content_line_length(text)
|
52
|
+
lines.each do |line|
|
53
|
+
if line.length > 0
|
54
|
+
unless line_closed?(line,length)
|
55
|
+
break_lines << line
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
break_lines
|
60
|
+
end
|
61
|
+
|
62
|
+
|
63
|
+
def text_to_array(text)
|
64
|
+
page_lines = []
|
65
|
+
text.each_line do |line|
|
66
|
+
line.gsub!("\r\n","")
|
67
|
+
line.gsub!("\n","")
|
68
|
+
line.strip!
|
69
|
+
page_lines << line
|
70
|
+
end
|
71
|
+
page_lines
|
72
|
+
end
|
73
|
+
|
55
74
|
# 计算文本相似度
|
56
75
|
def text_similarity(text1,text2)
|
57
76
|
return 0 if text1.blank? || text2.blank?
|
@@ -101,6 +120,7 @@ module Utils
|
|
101
120
|
content.each_line{|line|
|
102
121
|
lengths << line.length
|
103
122
|
}
|
123
|
+
lengths.sort!
|
104
124
|
while true
|
105
125
|
line_length = lengths.pop
|
106
126
|
break if line_length < 80
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ebook_tools
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -155,7 +155,7 @@ files:
|
|
155
155
|
- bin/doc_book_import_mongo
|
156
156
|
- bin/xml2json
|
157
157
|
- lib/ebook_tools.rb
|
158
|
-
- lib/extract_book_struct.rb
|
158
|
+
- lib/txt_book.rb
|
159
159
|
- lib/header_detect.rb
|
160
160
|
- lib/pdf.rb
|
161
161
|
- lib/txt.rb
|