ebook_tools 0.0.6 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +3 -0
- data/ebook_tools.gemspec +2 -2
- data/lib/ebook_tools.rb +45 -11
- data/lib/{extract_book_struct.rb → txt_book.rb} +94 -154
- data/lib/utils.rb +28 -8
- metadata +2 -2
data/CHANGELOG
CHANGED
data/ebook_tools.gemspec
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{ebook_tools}
|
5
|
-
s.version = '0.0.6'
|
5
|
+
s.version = '0.1.0'
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Aaron"]
|
@@ -25,7 +25,7 @@ Gem::Specification.new do |s|
|
|
25
25
|
"bin/doc_book_import_mongo",
|
26
26
|
"bin/xml2json",
|
27
27
|
"lib/ebook_tools.rb",
|
28
|
-
"lib/extract_book_struct.rb",
|
28
|
+
"lib/txt_book.rb",
|
29
29
|
"lib/header_detect.rb",
|
30
30
|
"lib/pdf.rb",
|
31
31
|
"lib/txt.rb",
|
data/lib/ebook_tools.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# encoding: UTF-8
|
3
|
-
['utils','epub','txt','pdf','header_detect','extract_book_struct'].each do |file|
|
3
|
+
['utils','epub','txt','pdf','header_detect','txt_book'].each do |file|
|
4
4
|
require File.join(File.dirname(__FILE__),file)
|
5
5
|
end
|
6
6
|
|
@@ -201,16 +201,27 @@ module EbookTools
|
|
201
201
|
end
|
202
202
|
|
203
203
|
def extract_book_struct_to_file(source,destination,options={})
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
204
|
+
options[:title] ||= File.basename(source,File.extname(source))
|
205
|
+
content = case File.extname(source)
|
206
|
+
when '.html'
|
207
|
+
extract_text_from_file(source,'.html')
|
208
|
+
when '.epub'
|
209
|
+
text = extract_text_from_file(source,'.epub')
|
210
|
+
sanitize_for_epub_text(text)
|
211
|
+
when '.txt'
|
212
|
+
File.open(source).read
|
213
|
+
end
|
214
|
+
txt_book = TxtBook.new(content,options)
|
215
|
+
docbook_xml = txt_book.to_doc_book
|
216
|
+
if docbook_xml
|
217
|
+
FileUtils.mkdir_p(File.dirname(destination)) unless Dir.exists?(File.dirname(destination))
|
218
|
+
File.open(destination,'wb'){|file|file.write docbook_xml}
|
219
|
+
puts "目录结构:"
|
220
|
+
puts txt_book.toc_to_text
|
221
|
+
puts "共修复#{txt_book.breaklines_count}个断点."
|
222
|
+
return true
|
223
|
+
else
|
224
|
+
return nil
|
214
225
|
end
|
215
226
|
end
|
216
227
|
|
@@ -254,4 +265,27 @@ module EbookTools
|
|
254
265
|
content = Utils.fixed_page_break(content,options)
|
255
266
|
File.open(target_file,'w'){|file| file.write content}
|
256
267
|
end
|
268
|
+
|
269
|
+
def extract_text_from_file(filename,format)
|
270
|
+
txt_file = File.basename(filename,format)
|
271
|
+
cmd = %Q(ebook-convert #{filename} #{txt_file}.txt)
|
272
|
+
output = `#{cmd}`
|
273
|
+
content = File.open("#{txt_file}.txt").read
|
274
|
+
FileUtils.remove_file("#{txt_file}.txt",true)
|
275
|
+
return content
|
276
|
+
end
|
277
|
+
|
278
|
+
# sanitize_for_epub_text
|
279
|
+
def sanitize_for_epub_text(content)
|
280
|
+
return content if content.blank?
|
281
|
+
lines = []
|
282
|
+
content.each_line do |line|
|
283
|
+
unless line.downcase.include?('document outline')
|
284
|
+
lines << line
|
285
|
+
else
|
286
|
+
break;
|
287
|
+
end
|
288
|
+
end
|
289
|
+
lines.join("")
|
290
|
+
end
|
257
291
|
end
|
@@ -1,7 +1,10 @@
|
|
1
|
-
# encoding:
|
2
|
-
|
3
|
-
|
4
|
-
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'uuid'
|
3
|
+
require 'cgi'
|
4
|
+
|
5
|
+
#=文本书籍
|
6
|
+
# 处理TXT格式的书籍。
|
7
|
+
#
|
5
8
|
# 1. 文档的编码格式必须是UTF-8或GB2312,推荐使用UTF-8格式
|
6
9
|
# 2. 文档的内容只包含书内容部分(书名、作者、目录等信息应该不包含在文档内)
|
7
10
|
# 3. 文档的段落应该完整(有些PDF转换过来的文档会破坏句子,需要进行预处理)
|
@@ -9,145 +12,83 @@
|
|
9
12
|
# 5. 文档需要包含结构信息(例如: 卷、篇、部分、章(回)节或者有连续的序号)
|
10
13
|
# 6. 每个结构信息都应该独立成行。
|
11
14
|
#
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
# === ExtractBookStruct.from_txt
|
16
|
-
# 从文本文件中提取目录结构
|
17
|
-
#
|
18
|
-
# === ExtractBookStruct.from_epub
|
19
|
-
# 从EPUB文件中提取目录结构
|
20
|
-
#
|
21
|
-
# === ExtractBookStruct.from_html
|
22
|
-
# 从HTML中提取目录结构
|
15
|
+
class TxtBook
|
16
|
+
include HeaderDetect
|
17
|
+
attr_reader :title,:author,:publisher,:pubdate,:isbn,:content
|
23
18
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
module ExtractBookStruct
|
29
|
-
extend self
|
30
|
-
extend HeaderDetect
|
31
|
-
|
32
|
-
def from_txt(filename,options={})
|
33
|
-
options[:title] ||= File.basename(filename,File.extname(filename))
|
19
|
+
def self.load(filename,options={})
|
20
|
+
raise '无效的文件' unless File.exists?(filename)
|
21
|
+
options[:title] = File.basename(filename, File.extname(filename))
|
34
22
|
content = File.open(filename).read
|
35
|
-
|
36
|
-
end
|
37
|
-
|
38
|
-
def from_html(filename,options={})
|
39
|
-
options[:title] ||= File.basename(filename,File.extname(filename))
|
40
|
-
content = extract_text_from_file(filename,'.html')
|
41
|
-
extract_book_struct(content,options)
|
42
|
-
end
|
43
|
-
|
44
|
-
def from_epub(filename,options={})
|
45
|
-
options[:title] ||= File.basename(filename,File.extname(filename))
|
46
|
-
content = extract_text_from_file(filename,'.epub')
|
47
|
-
extract_book_struct(content,options)
|
23
|
+
new(content,options)
|
48
24
|
end
|
25
|
+
|
26
|
+
def initialize(content,options={})
|
27
|
+
@title = options[:title]
|
28
|
+
@author = options[:author]
|
29
|
+
@publisher = options[:publisher]
|
30
|
+
@pubdate= options[:pubdate]
|
31
|
+
@isbn = options[:isbn]
|
32
|
+
@format = options[:format]
|
49
33
|
|
50
|
-
def extract_book_struct(content,options={})
|
51
34
|
unless Utils.detect_utf8(content)
|
52
35
|
content = Utils.to_utf8(content)
|
53
36
|
end
|
54
|
-
content =
|
55
|
-
paras = extract_paras(content)
|
56
|
-
# 检查书类型(text,digital,hybrid)
|
57
|
-
format = options[:format] || detect_struct_type(paras)
|
58
|
-
case format
|
59
|
-
when :text
|
60
|
-
extract_text_book_struct(paras,options)
|
61
|
-
when :digital
|
62
|
-
extract_digital_book_struct(paras,options)
|
63
|
-
when :hybrid
|
64
|
-
extract_hybrid_book_struct(paras,options)
|
65
|
-
else
|
66
|
-
return nil
|
67
|
-
end
|
37
|
+
@content = content
|
68
38
|
end
|
69
39
|
|
70
|
-
def
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
40
|
+
def struct_content
|
41
|
+
return @struct_content if @struct_content
|
42
|
+
content = if breaklines_count > 100
|
43
|
+
Utils.fixed_page_break(@content)
|
44
|
+
else
|
45
|
+
@content
|
46
|
+
end
|
47
|
+
@struct_content = extract_book_struct(content,:format=>@format)
|
77
48
|
end
|
78
49
|
|
79
|
-
def extract_paras(content)
|
80
|
-
|
81
|
-
return paras if content.blank?
|
82
|
-
content.each_line do |line|
|
83
|
-
text = Utils.clean_text(line)
|
84
|
-
paras << text if text.length > 0
|
85
|
-
end
|
86
|
-
paras
|
50
|
+
def breaklines
|
51
|
+
@breaklines ||= Utils.breaklines(content)
|
87
52
|
end
|
88
53
|
|
89
|
-
def detect_struct_type(paras)
|
90
|
-
|
91
|
-
digital_flag = false
|
92
|
-
paras.each do |para|
|
93
|
-
if guess_volume?(para) || guess_part?(para) || guess_chapter?(para) || guess_section?(para)
|
94
|
-
text_flag = true
|
95
|
-
end
|
96
|
-
|
97
|
-
if guess_digital_header?(para)
|
98
|
-
digital_flag = true
|
99
|
-
end
|
100
|
-
end
|
101
|
-
|
102
|
-
if text_flag && digital_flag
|
103
|
-
:hybrid
|
104
|
-
elsif text_flag
|
105
|
-
:text
|
106
|
-
elsif digital_flag
|
107
|
-
:digital
|
108
|
-
else
|
109
|
-
:unknown
|
110
|
-
end
|
54
|
+
def breaklines_count
|
55
|
+
breaklines.count
|
111
56
|
end
|
112
57
|
|
113
|
-
|
114
|
-
|
115
|
-
# 标注结构信息
|
116
|
-
marked_content = mark_struct_info(content)
|
117
|
-
# 构建书结构
|
118
|
-
struct = build_struct(marked_content)
|
119
|
-
# 修正结构
|
120
|
-
revised_struct = revise_struct(struct)
|
121
|
-
# 生成docbook
|
122
|
-
build_doc_book(revised_struct,options)
|
58
|
+
def toc
|
59
|
+
@toc ||= extract_toc_from_struct(struct_content)
|
123
60
|
end
|
124
61
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
# 构建书结构
|
130
|
-
struct = build_struct(marked_content)
|
131
|
-
|
132
|
-
# 修正结构
|
133
|
-
revised_struct = revise_struct(struct)
|
134
|
-
|
135
|
-
# 生成docbook
|
136
|
-
build_doc_book(revised_struct,options)
|
62
|
+
def toc_to_text
|
63
|
+
gen_toc(toc) do |item,children|
|
64
|
+
"#{item[:title]}\n#{children}"
|
65
|
+
end
|
137
66
|
end
|
138
67
|
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
struct = build_struct(marked_content)
|
145
|
-
|
146
|
-
# 修正结构
|
147
|
-
revised_struct = revise_struct(struct)
|
68
|
+
def to_doc_book
|
69
|
+
if struct_content
|
70
|
+
build_doc_book(struct_content,{:title=>title,:publisher=>publisher,:pubdate=>pubdate,:author=>author,:isbn=>isbn})
|
71
|
+
end
|
72
|
+
end
|
148
73
|
|
149
|
-
|
150
|
-
|
74
|
+
private
|
75
|
+
def extract_book_struct(content,options={})
|
76
|
+
paras = extract_paras(content)
|
77
|
+
# 检查书类型(text,digital,hybrid)
|
78
|
+
format = options[:format] || detect_struct_type(paras)
|
79
|
+
marked_content = case format
|
80
|
+
when :text
|
81
|
+
mark_struct_info(paras)
|
82
|
+
when :digital
|
83
|
+
mark_digital_struct_info(paras)
|
84
|
+
when :hybrid
|
85
|
+
mark_hybrid_struct_info(paras)
|
86
|
+
else
|
87
|
+
return nil
|
88
|
+
end
|
89
|
+
if marked_content
|
90
|
+
build_struct(marked_content)
|
91
|
+
end
|
151
92
|
end
|
152
93
|
|
153
94
|
# 标注结构信息
|
@@ -207,14 +148,7 @@ module ExtractBookStruct
|
|
207
148
|
marked_content
|
208
149
|
end
|
209
150
|
|
210
|
-
# 修正结构 TODO
|
211
|
-
def revise_struct(struct)
|
212
|
-
struct
|
213
|
-
end
|
214
|
-
|
215
151
|
def build_doc_book(struct,options={})
|
216
|
-
toc = extract_toc_from_struct(struct)
|
217
|
-
|
218
152
|
doc_toc = gen_docbook_toc(toc)
|
219
153
|
|
220
154
|
struct = struct.map{|item| item if item.is_a?(Hash)}.compact
|
@@ -380,20 +314,6 @@ EOS
|
|
380
314
|
end
|
381
315
|
end
|
382
316
|
|
383
|
-
=begin
|
384
|
-
def gen_docbook_tocdiv(toc)
|
385
|
-
doc_toc = []
|
386
|
-
toc.each do |item|
|
387
|
-
children = ""
|
388
|
-
if item[:children].any?
|
389
|
-
children = gen_docbook_tocdiv(item[:children])
|
390
|
-
end
|
391
|
-
doc_toc << "<tocdiv><title>#{item[:title]}</title>#{children}</tocdiv>"
|
392
|
-
end
|
393
|
-
doc_toc.join("")
|
394
|
-
end
|
395
|
-
=end
|
396
|
-
|
397
317
|
def gen_docbook_content(struct)
|
398
318
|
content = []
|
399
319
|
struct.each do |item|
|
@@ -420,17 +340,37 @@ EOS
|
|
420
340
|
content.join("\n")
|
421
341
|
end
|
422
342
|
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
lines << line
|
430
|
-
else
|
431
|
-
break;
|
343
|
+
def detect_struct_type(paras)
|
344
|
+
text_flag = false
|
345
|
+
digital_flag = false
|
346
|
+
paras.each do |para|
|
347
|
+
if guess_volume?(para) || guess_part?(para) || guess_chapter?(para) || guess_section?(para)
|
348
|
+
text_flag = true
|
432
349
|
end
|
350
|
+
|
351
|
+
if guess_digital_header?(para)
|
352
|
+
digital_flag = true
|
353
|
+
end
|
354
|
+
end
|
355
|
+
|
356
|
+
if text_flag && digital_flag
|
357
|
+
:hybrid
|
358
|
+
elsif text_flag
|
359
|
+
:text
|
360
|
+
elsif digital_flag
|
361
|
+
:digital
|
362
|
+
else
|
363
|
+
:unknown
|
433
364
|
end
|
434
|
-
|
365
|
+
end
|
366
|
+
|
367
|
+
def extract_paras(content)
|
368
|
+
paras = []
|
369
|
+
return paras if content.blank?
|
370
|
+
content.each_line do |line|
|
371
|
+
text = Utils.clean_text(line)
|
372
|
+
paras << text if text.length > 0
|
373
|
+
end
|
374
|
+
paras
|
435
375
|
end
|
436
376
|
end
|
data/lib/utils.rb
CHANGED
@@ -23,15 +23,8 @@ module Utils
|
|
23
23
|
# parameters:
|
24
24
|
# +page_text+ 文本内容
|
25
25
|
def fixed_page_break(page_text,options={})
|
26
|
-
page_lines = []
|
27
26
|
length = options[:length] || guess_content_line_length(page_text)
|
28
|
-
|
29
|
-
page_text.each_line do |line|
|
30
|
-
line.gsub!("\r\n","")
|
31
|
-
line.gsub!("\n","")
|
32
|
-
line.strip!
|
33
|
-
page_lines << line
|
34
|
-
end
|
27
|
+
page_lines = text_to_array(page_text)
|
35
28
|
|
36
29
|
lines = []
|
37
30
|
flag_tag = false
|
@@ -52,6 +45,32 @@ module Utils
|
|
52
45
|
lines.join("\n")
|
53
46
|
end
|
54
47
|
|
48
|
+
def breaklines(text,options={})
|
49
|
+
break_lines = []
|
50
|
+
lines = text_to_array(text)
|
51
|
+
length = options[:length] || guess_content_line_length(text)
|
52
|
+
lines.each do |line|
|
53
|
+
if line.length > 0
|
54
|
+
unless line_closed?(line,length)
|
55
|
+
break_lines << line
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
break_lines
|
60
|
+
end
|
61
|
+
|
62
|
+
|
63
|
+
def text_to_array(text)
|
64
|
+
page_lines = []
|
65
|
+
text.each_line do |line|
|
66
|
+
line.gsub!("\r\n","")
|
67
|
+
line.gsub!("\n","")
|
68
|
+
line.strip!
|
69
|
+
page_lines << line
|
70
|
+
end
|
71
|
+
page_lines
|
72
|
+
end
|
73
|
+
|
55
74
|
# 计算文本相似度
|
56
75
|
def text_similarity(text1,text2)
|
57
76
|
return 0 if text1.blank? || text2.blank?
|
@@ -101,6 +120,7 @@ module Utils
|
|
101
120
|
content.each_line{|line|
|
102
121
|
lengths << line.length
|
103
122
|
}
|
123
|
+
lengths.sort!
|
104
124
|
while true
|
105
125
|
line_length = lengths.pop
|
106
126
|
break if line_length < 80
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ebook_tools
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -155,7 +155,7 @@ files:
|
|
155
155
|
- bin/doc_book_import_mongo
|
156
156
|
- bin/xml2json
|
157
157
|
- lib/ebook_tools.rb
|
158
|
-
- lib/extract_book_struct.rb
|
158
|
+
- lib/txt_book.rb
|
159
159
|
- lib/header_detect.rb
|
160
160
|
- lib/pdf.rb
|
161
161
|
- lib/txt.rb
|