ebook_tools 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ebook_tools.gemspec +10 -9
- data/lib/ebook_tools.rb +32 -35
- data/lib/epub_book.rb +164 -0
- data/lib/paras_in_mongo.rb +1 -0
- data/lib/txt_book.rb +38 -18
- data/lib/utils.rb +15 -0
- metadata +8 -7
data/ebook_tools.gemspec
CHANGED
@@ -2,13 +2,13 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{ebook_tools}
|
5
|
-
s.version = '0.1.
|
5
|
+
s.version = '0.1.5'
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Aaron"]
|
9
|
-
s.date = %q{2013-04
|
9
|
+
s.date = %q{2013-06-04}
|
10
10
|
s.description = %q{电子书工具集.}
|
11
|
-
s.email = %q{
|
11
|
+
s.email = %q{yalong1976@gmail.com}
|
12
12
|
s.require_paths = ["lib"]
|
13
13
|
s.requirements = ["none"]
|
14
14
|
s.summary = %q{电子书工具集.}
|
@@ -25,15 +25,16 @@ Gem::Specification.new do |s|
|
|
25
25
|
"bin/doc_book_import_mongo",
|
26
26
|
"bin/para_import_scheduling",
|
27
27
|
"bin/xml2json",
|
28
|
+
"lib/doc_book_in_mongo.rb",
|
28
29
|
"lib/ebook_tools.rb",
|
29
|
-
"lib/txt_book.rb",
|
30
|
-
"lib/header_detect.rb",
|
31
|
-
"lib/pdf.rb",
|
32
|
-
"lib/txt.rb",
|
33
30
|
"lib/epub.rb",
|
31
|
+
"lib/epub_book.rb",
|
32
|
+
"lib/header_detect.rb",
|
33
|
+
"lib/paras_in_mongo.rb",
|
34
|
+
"lib/pdf.rb",
|
35
|
+
"lib/txt.rb",
|
36
|
+
"lib/txt_book.rb",
|
34
37
|
"lib/utils.rb",
|
35
|
-
"lib/paras_in_mongo.rb",
|
36
|
-
"lib/doc_book_in_mongo.rb",
|
37
38
|
"workers/para_import_worker.rb",
|
38
39
|
"ebook_tools.gemspec"
|
39
40
|
]
|
data/lib/ebook_tools.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# encoding: UTF-8
|
3
|
-
['utils','epub','txt','pdf','header_detect','txt_book'].each do |file|
|
3
|
+
['utils','epub','txt','pdf','header_detect','txt_book','epub_book'].each do |file|
|
4
4
|
require File.join(File.dirname(__FILE__),file)
|
5
5
|
end
|
6
6
|
|
@@ -202,27 +202,34 @@ module EbookTools
|
|
202
202
|
|
203
203
|
def extract_book_struct_to_file(source,destination,options={})
|
204
204
|
options[:title] ||= File.basename(source,File.extname(source))
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
txt_book = TxtBook.new(content,options)
|
215
|
-
docbook_xml = txt_book.to_doc_book
|
216
|
-
if docbook_xml
|
217
|
-
FileUtils.mkdir_p(File.dirname(destination)) unless Dir.exists?(File.dirname(destination))
|
218
|
-
File.open(destination,'wb'){|file|file.write docbook_xml}
|
219
|
-
puts "目录结构:"
|
220
|
-
puts txt_book.toc_to_text
|
221
|
-
puts "共修复#{txt_book.breaklines_count}个断点."
|
222
|
-
return true
|
205
|
+
if File.extname(source) == '.epub'
|
206
|
+
epub_book = EpubBook.new(source,options)
|
207
|
+
docbook_xml = epub_book.to_doc_book
|
208
|
+
if docbook_xml
|
209
|
+
write_doc_book(destination,docbook_xml)
|
210
|
+
puts "目录结构:"
|
211
|
+
puts epub_book.toc_to_text
|
212
|
+
return true
|
213
|
+
end
|
223
214
|
else
|
224
|
-
|
215
|
+
content = case File.extname(source)
|
216
|
+
when '.html'
|
217
|
+
Utils.extract_text_from_file(source,'.html')
|
218
|
+
when '.txt'
|
219
|
+
File.open(source).read
|
220
|
+
end
|
221
|
+
txt_book = TxtBook.new(content,options)
|
222
|
+
docbook_xml = txt_book.to_doc_book
|
223
|
+
if docbook_xml
|
224
|
+
write_doc_book(destination,docbook_xml)
|
225
|
+
puts "目录结构:"
|
226
|
+
puts txt_book.toc_to_text
|
227
|
+
puts "共修复#{txt_book.breaklines_count}个断点."
|
228
|
+
return true
|
229
|
+
end
|
225
230
|
end
|
231
|
+
|
232
|
+
return nil
|
226
233
|
end
|
227
234
|
|
228
235
|
# batch_extract_from_dir
|
@@ -262,6 +269,11 @@ module EbookTools
|
|
262
269
|
end
|
263
270
|
end
|
264
271
|
|
272
|
+
def write_doc_book(destination, docbook_xml)
|
273
|
+
FileUtils.mkdir_p(File.dirname(destination)) unless Dir.exists?(File.dirname(destination))
|
274
|
+
File.open(destination,'wb'){|file|file.write docbook_xml}
|
275
|
+
end
|
276
|
+
|
265
277
|
# text_paras_repair
|
266
278
|
# 对文本文件格式中的中断段落进行修复
|
267
279
|
def text_paras_repair(source_file,target_file,options={})
|
@@ -271,21 +283,6 @@ module EbookTools
|
|
271
283
|
File.open(target_file,'w'){|file| file.write content}
|
272
284
|
end
|
273
285
|
|
274
|
-
def extract_text_from_file(filename,format)
|
275
|
-
txt_file = File.basename(filename,format)
|
276
|
-
if !filename.include?("'")
|
277
|
-
cmd = %Q(ebook-convert '#{filename}' '#{txt_file}.txt')
|
278
|
-
elsif !filename.include?('"')
|
279
|
-
cmd = %Q(ebook-convert "#{filename}" "#{txt_file}.txt")
|
280
|
-
else
|
281
|
-
cmd = %Q(ebook-convert #{filename} #{txt_file}.txt)
|
282
|
-
end
|
283
|
-
output = `#{cmd}`
|
284
|
-
content = File.open("#{txt_file}.txt").read
|
285
|
-
FileUtils.remove_file("#{txt_file}.txt",true)
|
286
|
-
return content
|
287
|
-
end
|
288
|
-
|
289
286
|
# sanitize_for_epub_text
|
290
287
|
def sanitize_for_epub_text(content)
|
291
288
|
return content if content.blank?
|
data/lib/epub_book.rb
ADDED
@@ -0,0 +1,164 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'uuid'
|
3
|
+
require 'cgi'
|
4
|
+
|
5
|
+
# epub_book
|
6
|
+
# 处理EPUB书的类。
|
7
|
+
class EpubBook
|
8
|
+
attr_reader :title,:author,:publisher,:pubdate,:isbn,:content,:outline
|
9
|
+
|
10
|
+
def initialize(filename,options={})
|
11
|
+
raise '无效的文件' unless File.exists?(filename)
|
12
|
+
@title = options[:title]
|
13
|
+
@author = options[:author]
|
14
|
+
@publisher = options[:publisher]
|
15
|
+
@pubdate= options[:pubdate]
|
16
|
+
@isbn = options[:isbn]
|
17
|
+
|
18
|
+
text = Utils.extract_text_from_file(filename,'.epub')
|
19
|
+
unless Utils.detect_utf8(text)
|
20
|
+
text = Utils.to_utf8(text)
|
21
|
+
end
|
22
|
+
text = preprocess_content(text)
|
23
|
+
@outline, @content = extract_for_epub_text(text)
|
24
|
+
end
|
25
|
+
|
26
|
+
def toc_to_text
|
27
|
+
outline
|
28
|
+
end
|
29
|
+
|
30
|
+
def to_doc_book
|
31
|
+
build_doc_book(@outline,@content,{:title=>title,:publisher=>publisher,:pubdate=>pubdate,:author=>author,:isbn=>isbn})
|
32
|
+
end
|
33
|
+
|
34
|
+
|
35
|
+
private
|
36
|
+
def preprocess_content(content)
|
37
|
+
paras = extract_paras(content)
|
38
|
+
paras.join("\n")
|
39
|
+
end
|
40
|
+
|
41
|
+
def extract_paras(content)
|
42
|
+
paras = []
|
43
|
+
return paras if content.blank?
|
44
|
+
content.each_line do |line|
|
45
|
+
text = Utils.clean_text(line)
|
46
|
+
paras << text if text.length > 0
|
47
|
+
end
|
48
|
+
paras
|
49
|
+
end
|
50
|
+
|
51
|
+
def build_doc_book(outline,content,options={})
|
52
|
+
doc_toc = gen_docbook_toc(outline.split("\n"))
|
53
|
+
|
54
|
+
doc_content = gen_docbook_content(content)
|
55
|
+
|
56
|
+
<<-EOS
|
57
|
+
<?xml version="1.0" encoding="utf-8"?>
|
58
|
+
<book xmlns="http://docbook.org/ns/docbook" version="5.0" id="#{UUID.generate}">
|
59
|
+
<info>
|
60
|
+
<title>#{options[:title]}</title>
|
61
|
+
<authorgroup>
|
62
|
+
<author><personname>#{options[:author]}</personname></author>
|
63
|
+
</authorgroup>
|
64
|
+
<pubdate>#{options[:pubdate]}</pubdate>
|
65
|
+
<publisher><publishername>#{options[:publisher]}</publishername></publisher>
|
66
|
+
</info>
|
67
|
+
#{doc_toc}
|
68
|
+
#{doc_content}
|
69
|
+
</book>
|
70
|
+
EOS
|
71
|
+
end
|
72
|
+
|
73
|
+
def gen_docbook_toc(toc)
|
74
|
+
"<toc><title>Table of contents</title>#{gen_docbook_tocdiv(toc)}</toc>"
|
75
|
+
end
|
76
|
+
|
77
|
+
def gen_docbook_tocdiv(toc)
|
78
|
+
toc.map do |item|
|
79
|
+
text = Utils.escape_html(Utils.clean_text(item))
|
80
|
+
"<tocdiv><title>#{item}</title></tocdiv>"
|
81
|
+
end.join("")
|
82
|
+
end
|
83
|
+
|
84
|
+
def gen_docbook_content(content)
|
85
|
+
paras = extract_paras(content)
|
86
|
+
paras_content = paras.map do |para|
|
87
|
+
text = Utils.escape_html(Utils.clean_text(para))
|
88
|
+
"<para id='#{UUID.generate}'>#{text}</para>"
|
89
|
+
end.join("\n")
|
90
|
+
"<sect1>#{paras_content}</sect1>"
|
91
|
+
end
|
92
|
+
|
93
|
+
# extract_for_epub_text
|
94
|
+
def extract_for_epub_text(content)
|
95
|
+
return content if content.blank?
|
96
|
+
if outline_type?(content)
|
97
|
+
extract_outline_with_content(content)
|
98
|
+
elsif toc_type?(content)
|
99
|
+
extract_toc_with_content(content)
|
100
|
+
else
|
101
|
+
['',content]
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
def extract_outline_with_content(content)
|
106
|
+
outline_flag = false
|
107
|
+
lines = []
|
108
|
+
outline = []
|
109
|
+
content.each_line do |line|
|
110
|
+
if line.strip.downcase == 'document outline'
|
111
|
+
outline_flag = true
|
112
|
+
next
|
113
|
+
end
|
114
|
+
unless outline_flag
|
115
|
+
lines << line
|
116
|
+
else
|
117
|
+
outline << line;
|
118
|
+
end
|
119
|
+
end
|
120
|
+
[outline.join("") , lines.join("")]
|
121
|
+
end
|
122
|
+
|
123
|
+
def extract_toc_with_content(content)
|
124
|
+
toc = []
|
125
|
+
lines = []
|
126
|
+
|
127
|
+
paras = extract_paras(content)
|
128
|
+
index = paras.index('Content')
|
129
|
+
paras = paras[(index+1)..-1]
|
130
|
+
|
131
|
+
point = nil
|
132
|
+
|
133
|
+
paras.each_with_index do |para, index|
|
134
|
+
if toc.include?(para)
|
135
|
+
point = index
|
136
|
+
break
|
137
|
+
else
|
138
|
+
toc << para
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
142
|
+
lines = paras[(point+1)..-1]
|
143
|
+
[toc.join("\n") , lines.join("\n")]
|
144
|
+
rescue
|
145
|
+
['', content]
|
146
|
+
end
|
147
|
+
|
148
|
+
def outline_type?(content)
|
149
|
+
content.each_line do |line|
|
150
|
+
return true if line.strip.downcase == 'document outline'
|
151
|
+
end
|
152
|
+
false
|
153
|
+
end
|
154
|
+
|
155
|
+
def toc_type?(content)
|
156
|
+
toc_flag = false
|
157
|
+
toc_flag1 = false
|
158
|
+
content.each_line do |line|
|
159
|
+
toc_flag = true if line.strip.downcase == '目录'
|
160
|
+
toc_flag1 = true if line.strip.downcase == 'content'
|
161
|
+
end
|
162
|
+
toc_flag && toc_flag1
|
163
|
+
end
|
164
|
+
end
|
data/lib/paras_in_mongo.rb
CHANGED
data/lib/txt_book.rb
CHANGED
@@ -87,18 +87,22 @@ class TxtBook
|
|
87
87
|
end
|
88
88
|
|
89
89
|
def toc
|
90
|
-
@toc ||= extract_toc_from_struct(struct_content)
|
90
|
+
@toc ||= extract_toc_from_struct(struct_content) if struct_content
|
91
91
|
end
|
92
92
|
|
93
93
|
def toc_to_text
|
94
|
-
|
95
|
-
|
94
|
+
if toc
|
95
|
+
gen_toc(toc) do |item,children|
|
96
|
+
"#{item[:title]}\n#{children}"
|
97
|
+
end
|
96
98
|
end
|
97
99
|
end
|
98
100
|
|
99
101
|
def to_doc_book
|
100
102
|
if struct_content
|
101
103
|
build_doc_book(struct_content,{:title=>title,:publisher=>publisher,:pubdate=>pubdate,:author=>author,:isbn=>isbn})
|
104
|
+
else
|
105
|
+
build_doc_book(content,{:title=>title,:publisher=>publisher,:pubdate=>pubdate,:author=>author,:isbn=>isbn})
|
102
106
|
end
|
103
107
|
end
|
104
108
|
|
@@ -184,12 +188,15 @@ class TxtBook
|
|
184
188
|
marked_content
|
185
189
|
end
|
186
190
|
|
187
|
-
def build_doc_book(
|
188
|
-
doc_toc = gen_docbook_toc(toc)
|
191
|
+
def build_doc_book(content,options={})
|
192
|
+
doc_toc = gen_docbook_toc(toc) if toc
|
189
193
|
|
190
|
-
|
191
|
-
|
192
|
-
|
194
|
+
doc_content = if content.is_a?(Array)
|
195
|
+
struct = content.map{|item| item if item.is_a?(Hash)}.compact
|
196
|
+
gen_docbook_content_with_struct(struct)
|
197
|
+
else
|
198
|
+
gen_docbook_content(content)
|
199
|
+
end
|
193
200
|
|
194
201
|
<<-EOS
|
195
202
|
<?xml version="1.0" encoding="utf-8"?>
|
@@ -208,6 +215,8 @@ class TxtBook
|
|
208
215
|
EOS
|
209
216
|
end
|
210
217
|
|
218
|
+
|
219
|
+
|
211
220
|
def build_struct(content)
|
212
221
|
stack = Array.new(8)
|
213
222
|
struct = []
|
@@ -314,18 +323,20 @@ EOS
|
|
314
323
|
end
|
315
324
|
|
316
325
|
def extract_toc_from_struct(struct)
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
326
|
+
if struct
|
327
|
+
toc = []
|
328
|
+
struct.each do |item|
|
329
|
+
if item.is_a?(Hash)
|
330
|
+
children = []
|
331
|
+
if item[:children].any?
|
332
|
+
children = extract_toc_from_struct(item[:children])
|
333
|
+
end
|
334
|
+
item_hash = {:title=>item[:title], :type=> item[:type],:children=>children}
|
335
|
+
toc << item_hash
|
323
336
|
end
|
324
|
-
item_hash = {:title=>item[:title], :type=> item[:type],:children=>children}
|
325
|
-
toc << item_hash
|
326
337
|
end
|
338
|
+
toc
|
327
339
|
end
|
328
|
-
toc
|
329
340
|
end
|
330
341
|
|
331
342
|
def gen_docbook_toc(toc)
|
@@ -350,7 +361,16 @@ EOS
|
|
350
361
|
end
|
351
362
|
end
|
352
363
|
|
353
|
-
def gen_docbook_content(
|
364
|
+
def gen_docbook_content(content)
|
365
|
+
paras = extract_paras(content)
|
366
|
+
paras_content = paras.map do |para|
|
367
|
+
text = Utils.escape_html(Utils.clean_text(para))
|
368
|
+
"<para id='#{UUID.generate}'>#{text}</para>"
|
369
|
+
end.join("\n")
|
370
|
+
"<sect1>#{paras_content}</sect1>"
|
371
|
+
end
|
372
|
+
|
373
|
+
def gen_docbook_content_with_struct(struct)
|
354
374
|
content = []
|
355
375
|
struct.each do |item|
|
356
376
|
if item.is_a?(Hash)
|
data/lib/utils.rb
CHANGED
@@ -256,4 +256,19 @@ module Utils
|
|
256
256
|
sections
|
257
257
|
end
|
258
258
|
|
259
|
+
def extract_text_from_file(filename,format)
|
260
|
+
txt_file = File.basename(filename,format)
|
261
|
+
if !filename.include?("'")
|
262
|
+
cmd = %Q(ebook-convert '#{filename}' '#{txt_file}.txt')
|
263
|
+
elsif !filename.include?('"')
|
264
|
+
cmd = %Q(ebook-convert "#{filename}" "#{txt_file}.txt")
|
265
|
+
else
|
266
|
+
cmd = %Q(ebook-convert #{filename} #{txt_file}.txt)
|
267
|
+
end
|
268
|
+
output = `#{cmd}`
|
269
|
+
content = File.open("#{txt_file}.txt").read
|
270
|
+
FileUtils.remove_file("#{txt_file}.txt",true)
|
271
|
+
return content
|
272
|
+
end
|
273
|
+
|
259
274
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ebook_tools
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-04
|
12
|
+
date: 2013-06-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: uuid
|
@@ -156,7 +156,7 @@ dependencies:
|
|
156
156
|
- !ruby/object:Gem::Version
|
157
157
|
version: '0'
|
158
158
|
description: 电子书工具集.
|
159
|
-
email:
|
159
|
+
email: yalong1976@gmail.com
|
160
160
|
executables:
|
161
161
|
- ebook_tools
|
162
162
|
- para_import_mongo
|
@@ -171,15 +171,16 @@ files:
|
|
171
171
|
- bin/doc_book_import_mongo
|
172
172
|
- bin/para_import_scheduling
|
173
173
|
- bin/xml2json
|
174
|
+
- lib/doc_book_in_mongo.rb
|
174
175
|
- lib/ebook_tools.rb
|
175
|
-
- lib/
|
176
|
+
- lib/epub.rb
|
177
|
+
- lib/epub_book.rb
|
176
178
|
- lib/header_detect.rb
|
179
|
+
- lib/paras_in_mongo.rb
|
177
180
|
- lib/pdf.rb
|
178
181
|
- lib/txt.rb
|
179
|
-
- lib/
|
182
|
+
- lib/txt_book.rb
|
180
183
|
- lib/utils.rb
|
181
|
-
- lib/paras_in_mongo.rb
|
182
|
-
- lib/doc_book_in_mongo.rb
|
183
184
|
- workers/para_import_worker.rb
|
184
185
|
- ebook_tools.gemspec
|
185
186
|
homepage:
|