ebook_tools 0.1.4 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
data/ebook_tools.gemspec CHANGED
@@ -2,13 +2,13 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{ebook_tools}
5
- s.version = '0.1.4'
5
+ s.version = '0.1.5'
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Aaron"]
9
- s.date = %q{2013-04-05}
9
+ s.date = %q{2013-06-04}
10
10
  s.description = %q{电子书工具集.}
11
- s.email = %q{aaron@nonobo.com}
11
+ s.email = %q{yalong1976@gmail.com}
12
12
  s.require_paths = ["lib"]
13
13
  s.requirements = ["none"]
14
14
  s.summary = %q{电子书工具集.}
@@ -25,15 +25,16 @@ Gem::Specification.new do |s|
25
25
  "bin/doc_book_import_mongo",
26
26
  "bin/para_import_scheduling",
27
27
  "bin/xml2json",
28
+ "lib/doc_book_in_mongo.rb",
28
29
  "lib/ebook_tools.rb",
29
- "lib/txt_book.rb",
30
- "lib/header_detect.rb",
31
- "lib/pdf.rb",
32
- "lib/txt.rb",
33
30
  "lib/epub.rb",
31
+ "lib/epub_book.rb",
32
+ "lib/header_detect.rb",
33
+ "lib/paras_in_mongo.rb",
34
+ "lib/pdf.rb",
35
+ "lib/txt.rb",
36
+ "lib/txt_book.rb",
34
37
  "lib/utils.rb",
35
- "lib/paras_in_mongo.rb",
36
- "lib/doc_book_in_mongo.rb",
37
38
  "workers/para_import_worker.rb",
38
39
  "ebook_tools.gemspec"
39
40
  ]
data/lib/ebook_tools.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  # encoding: UTF-8
3
- ['utils','epub','txt','pdf','header_detect','txt_book'].each do |file|
3
+ ['utils','epub','txt','pdf','header_detect','txt_book','epub_book'].each do |file|
4
4
  require File.join(File.dirname(__FILE__),file)
5
5
  end
6
6
 
@@ -202,27 +202,34 @@ module EbookTools
202
202
 
203
203
  def extract_book_struct_to_file(source,destination,options={})
204
204
  options[:title] ||= File.basename(source,File.extname(source))
205
- content = case File.extname(source)
206
- when '.html'
207
- extract_text_from_file(source,'.html')
208
- when '.epub'
209
- text = extract_text_from_file(source,'.epub')
210
- sanitize_for_epub_text(text)
211
- when '.txt'
212
- File.open(source).read
213
- end
214
- txt_book = TxtBook.new(content,options)
215
- docbook_xml = txt_book.to_doc_book
216
- if docbook_xml
217
- FileUtils.mkdir_p(File.dirname(destination)) unless Dir.exists?(File.dirname(destination))
218
- File.open(destination,'wb'){|file|file.write docbook_xml}
219
- puts "目录结构:"
220
- puts txt_book.toc_to_text
221
- puts "共修复#{txt_book.breaklines_count}个断点."
222
- return true
205
+ if File.extname(source) == '.epub'
206
+ epub_book = EpubBook.new(source,options)
207
+ docbook_xml = epub_book.to_doc_book
208
+ if docbook_xml
209
+ write_doc_book(destination,docbook_xml)
210
+ puts "目录结构:"
211
+ puts epub_book.toc_to_text
212
+ return true
213
+ end
223
214
  else
224
- return nil
215
+ content = case File.extname(source)
216
+ when '.html'
217
+ Utils.extract_text_from_file(source,'.html')
218
+ when '.txt'
219
+ File.open(source).read
220
+ end
221
+ txt_book = TxtBook.new(content,options)
222
+ docbook_xml = txt_book.to_doc_book
223
+ if docbook_xml
224
+ write_doc_book(destination,docbook_xml)
225
+ puts "目录结构:"
226
+ puts txt_book.toc_to_text
227
+ puts "共修复#{txt_book.breaklines_count}个断点."
228
+ return true
229
+ end
225
230
  end
231
+
232
+ return nil
226
233
  end
227
234
 
228
235
  # batch_extract_from_dir
@@ -262,6 +269,11 @@ module EbookTools
262
269
  end
263
270
  end
264
271
 
272
# write_doc_book
# Writes DocBook XML to +destination+, creating the parent directory first.
#
# destination - path of the file to write.
# docbook_xml - String with the XML document to store.
#
# Returns whatever the File.open block returns (the result of file.write).
def write_doc_book(destination, docbook_xml)
  # mkdir_p is a no-op for an existing directory, so no existence check is
  # needed (the previous Dir.exists? guard no longer exists in Ruby 3.2+).
  FileUtils.mkdir_p(File.dirname(destination))
  # Binary mode: the XML is written exactly as given, without newline translation.
  File.open(destination, 'wb') { |file| file.write(docbook_xml) }
end
276
+
265
277
  # text_paras_repair
266
278
  # 对文本文件格式中的中断段落进行修复
267
279
  def text_paras_repair(source_file,target_file,options={})
@@ -271,21 +283,6 @@ module EbookTools
271
283
  File.open(target_file,'w'){|file| file.write content}
272
284
  end
273
285
 
274
- def extract_text_from_file(filename,format)
275
- txt_file = File.basename(filename,format)
276
- if !filename.include?("'")
277
- cmd = %Q(ebook-convert '#{filename}' '#{txt_file}.txt')
278
- elsif !filename.include?('"')
279
- cmd = %Q(ebook-convert "#{filename}" "#{txt_file}.txt")
280
- else
281
- cmd = %Q(ebook-convert #{filename} #{txt_file}.txt)
282
- end
283
- output = `#{cmd}`
284
- content = File.open("#{txt_file}.txt").read
285
- FileUtils.remove_file("#{txt_file}.txt",true)
286
- return content
287
- end
288
-
289
286
  # sanitize_for_epub_text
290
287
  def sanitize_for_epub_text(content)
291
288
  return content if content.blank?
data/lib/epub_book.rb ADDED
@@ -0,0 +1,164 @@
1
+ # encoding: utf-8
2
+ require 'uuid'
3
+ require 'cgi'
4
+
5
# epub_book
# Class for processing EPUB books: the EPUB is converted to plain text, the
# text is split into an outline (table of contents) and a body, and both can
# be rendered as a DocBook XML document.
class EpubBook
  attr_reader :title, :author, :publisher, :pubdate, :isbn, :content, :outline

  # Line (compared stripped and lower-cased) that starts the outline section.
  OUTLINE_MARKER = 'document outline'.freeze
  # Lines (compared stripped and lower-cased) that must both be present for
  # the text to be treated as "table of contents followed by body".
  TOC_MARKERS = ['目录', 'content'].freeze

  # filename - path of the EPUB file; must exist.
  # options  - Hash with optional :title, :author, :publisher, :pubdate, :isbn.
  #
  # Raises RuntimeError when the file does not exist.
  def initialize(filename, options = {})
    # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
    raise '无效的文件' unless File.exist?(filename)

    @title = options[:title]
    @author = options[:author]
    @publisher = options[:publisher]
    @pubdate = options[:pubdate]
    @isbn = options[:isbn]

    text = Utils.extract_text_from_file(filename, '.epub')
    text = Utils.to_utf8(text) unless Utils.detect_utf8(text)
    @outline, @content = extract_for_epub_text(preprocess_content(text))
  end

  # Returns the extracted outline as text (one entry per line).
  def toc_to_text
    outline
  end

  # Returns the book as a DocBook XML String.
  def to_doc_book
    build_doc_book(@outline, @content,
                   {:title => title, :publisher => publisher, :pubdate => pubdate,
                    :author => author, :isbn => isbn})
  end

  private

  # Normalizes the raw text: every line is cleaned and empty lines are dropped.
  def preprocess_content(content)
    extract_paras(content).join("\n")
  end

  # Returns the non-empty cleaned lines of +content+ ([] for blank input).
  def extract_paras(content)
    return [] if content.blank?

    content.each_line.map { |line| Utils.clean_text(line) }.reject(&:empty?)
  end

  # Assembles the DocBook document from the outline text, the body text and
  # the metadata in +options+ (:title, :author, :pubdate, :publisher).
  def build_doc_book(outline, content, options = {})
    doc_toc = gen_docbook_toc(outline.to_s.split("\n"))
    doc_content = gen_docbook_content(content)

    <<-EOS
<?xml version="1.0" encoding="utf-8"?>
<book xmlns="http://docbook.org/ns/docbook" version="5.0" id="#{UUID.generate}">
<info>
<title>#{escape_meta(options[:title])}</title>
<authorgroup>
<author><personname>#{escape_meta(options[:author])}</personname></author>
</authorgroup>
<pubdate>#{escape_meta(options[:pubdate])}</pubdate>
<publisher><publishername>#{escape_meta(options[:publisher])}</publishername></publisher>
</info>
#{doc_toc}
#{doc_content}
</book>
    EOS
  end

  # Escapes a metadata value for use as XML text; nil becomes an empty string.
  # Without this, a title such as "A & B" would produce malformed XML.
  def escape_meta(value)
    CGI.escapeHTML(value.to_s)
  end

  # Wraps the TOC entries in a DocBook <toc> element.
  def gen_docbook_toc(toc)
    "<toc><title>Table of contents</title>#{gen_docbook_tocdiv(toc)}</toc>"
  end

  # Renders each TOC entry as a <tocdiv>, with the title cleaned and escaped.
  def gen_docbook_tocdiv(toc)
    toc.map do |item|
      text = Utils.escape_html(Utils.clean_text(item))
      # Interpolate the escaped text, not the raw item.
      "<tocdiv><title>#{text}</title></tocdiv>"
    end.join("")
  end

  # Renders the body as a single <sect1> with one <para> per non-empty line.
  def gen_docbook_content(content)
    paras_content = extract_paras(content).map do |para|
      text = Utils.escape_html(Utils.clean_text(para))
      "<para id='#{UUID.generate}'>#{text}</para>"
    end.join("\n")
    "<sect1>#{paras_content}</sect1>"
  end

  # extract_for_epub_text
  # Splits converted EPUB text into [outline, content]. The outline is empty
  # when neither known layout is detected or when the text is blank.
  def extract_for_epub_text(content)
    # Always return a pair so the caller's destructuring assignment is safe.
    return ['', content.to_s] if content.blank?

    if outline_type?(content)
      extract_outline_with_content(content)
    elsif toc_type?(content)
      extract_toc_with_content(content)
    else
      ['', content]
    end
  end

  # Layout 1: everything before the "document outline" line is body, everything
  # after it is outline. Marker lines themselves are dropped.
  def extract_outline_with_content(content)
    body = []
    outline = []
    target = body
    content.each_line do |line|
      if line.strip.downcase == OUTLINE_MARKER
        target = outline
      else
        target << line
      end
    end
    [outline.join(""), body.join("")]
  end

  # Layout 2: a TOC follows the "content" line; entries are collected until one
  # of them appears a second time, which is taken as the start of the body.
  # Falls back to ['', content] when the marker or the repeated entry is missing.
  def extract_toc_with_content(content)
    paras = extract_paras(content)
    # Case-insensitive, to agree with the detection done in toc_type?.
    start = paras.index { |para| para.strip.downcase == 'content' }
    return ['', content] if start.nil?

    entries = paras[(start + 1)..-1]
    toc = []
    seen = {}
    repeat_at = nil
    entries.each_with_index do |para, position|
      if seen[para]
        repeat_at = position
        break
      end
      seen[para] = true
      toc << para
    end
    return ['', content] if repeat_at.nil?

    # NOTE(review): the repeated entry itself is excluded from the body, as in
    # the previous implementation - confirm this is intended.
    [toc.join("\n"), entries[(repeat_at + 1)..-1].join("\n")]
  end

  # True when some line equals "document outline" (ignoring case and padding).
  def outline_type?(content)
    content.each_line.any? { |line| line.strip.downcase == OUTLINE_MARKER }
  end

  # True when the text has both a "目录" line and a "content" line.
  def toc_type?(content)
    normalized = content.each_line.map { |line| line.strip.downcase }
    TOC_MARKERS.all? { |marker| normalized.include?(marker) }
  end
end
@@ -2,6 +2,7 @@
2
2
  require 'moped'
3
3
  require 'nokogiri'
4
4
  require 'active_support'
5
+ require 'active_support/core_ext/hash'
5
6
 
6
7
  module ParasInMongo
7
8
  extend self
data/lib/txt_book.rb CHANGED
@@ -87,18 +87,22 @@ class TxtBook
87
87
  end
88
88
 
89
89
  def toc
90
- @toc ||= extract_toc_from_struct(struct_content)
90
+ @toc ||= extract_toc_from_struct(struct_content) if struct_content
91
91
  end
92
92
 
93
93
  def toc_to_text
94
- gen_toc(toc) do |item,children|
95
- "#{item[:title]}\n#{children}"
94
+ if toc
95
+ gen_toc(toc) do |item,children|
96
+ "#{item[:title]}\n#{children}"
97
+ end
96
98
  end
97
99
  end
98
100
 
99
101
  def to_doc_book
100
102
  if struct_content
101
103
  build_doc_book(struct_content,{:title=>title,:publisher=>publisher,:pubdate=>pubdate,:author=>author,:isbn=>isbn})
104
+ else
105
+ build_doc_book(content,{:title=>title,:publisher=>publisher,:pubdate=>pubdate,:author=>author,:isbn=>isbn})
102
106
  end
103
107
  end
104
108
 
@@ -184,12 +188,15 @@ class TxtBook
184
188
  marked_content
185
189
  end
186
190
 
187
- def build_doc_book(struct,options={})
188
- doc_toc = gen_docbook_toc(toc)
191
+ def build_doc_book(content,options={})
192
+ doc_toc = gen_docbook_toc(toc) if toc
189
193
 
190
- struct = struct.map{|item| item if item.is_a?(Hash)}.compact
191
-
192
- doc_content = gen_docbook_content(struct)
194
+ doc_content = if content.is_a?(Array)
195
+ struct = content.map{|item| item if item.is_a?(Hash)}.compact
196
+ gen_docbook_content_with_struct(struct)
197
+ else
198
+ gen_docbook_content(content)
199
+ end
193
200
 
194
201
  <<-EOS
195
202
  <?xml version="1.0" encoding="utf-8"?>
@@ -208,6 +215,8 @@ class TxtBook
208
215
  EOS
209
216
  end
210
217
 
218
+
219
+
211
220
  def build_struct(content)
212
221
  stack = Array.new(8)
213
222
  struct = []
@@ -314,18 +323,20 @@ EOS
314
323
  end
315
324
 
316
325
  def extract_toc_from_struct(struct)
317
- toc = []
318
- struct.each do |item|
319
- if item.is_a?(Hash)
320
- children = []
321
- if item[:children].any?
322
- children = extract_toc_from_struct(item[:children])
326
+ if struct
327
+ toc = []
328
+ struct.each do |item|
329
+ if item.is_a?(Hash)
330
+ children = []
331
+ if item[:children].any?
332
+ children = extract_toc_from_struct(item[:children])
333
+ end
334
+ item_hash = {:title=>item[:title], :type=> item[:type],:children=>children}
335
+ toc << item_hash
323
336
  end
324
- item_hash = {:title=>item[:title], :type=> item[:type],:children=>children}
325
- toc << item_hash
326
337
  end
338
+ toc
327
339
  end
328
- toc
329
340
  end
330
341
 
331
342
  def gen_docbook_toc(toc)
@@ -350,7 +361,16 @@ EOS
350
361
  end
351
362
  end
352
363
 
353
- def gen_docbook_content(struct)
364
+ def gen_docbook_content(content)
365
+ paras = extract_paras(content)
366
+ paras_content = paras.map do |para|
367
+ text = Utils.escape_html(Utils.clean_text(para))
368
+ "<para id='#{UUID.generate}'>#{text}</para>"
369
+ end.join("\n")
370
+ "<sect1>#{paras_content}</sect1>"
371
+ end
372
+
373
+ def gen_docbook_content_with_struct(struct)
354
374
  content = []
355
375
  struct.each do |item|
356
376
  if item.is_a?(Hash)
data/lib/utils.rb CHANGED
@@ -256,4 +256,19 @@ module Utils
256
256
  sections
257
257
  end
258
258
 
259
# extract_text_from_file
# Converts +filename+ to plain text with the external `ebook-convert` command
# and returns the converted text.
#
# filename - path of the source file.
# format   - extension (e.g. '.epub') stripped from the file's base name to
#            derive the name of the intermediate "<name>.txt" file.
#
# The intermediate file is created in the current working directory and is
# removed again before returning, even when reading it fails.
# NOTE(review): an existing file with the same name in the working directory
# is overwritten and then deleted - same as before; confirm this is acceptable.
#
# Raises Errno::ENOENT when the command is missing or produced no output file.
def extract_text_from_file(filename, format)
  txt_file = "#{File.basename(filename, format)}.txt"
  # Passing the command as an argument array bypasses the shell entirely, so
  # quotes, spaces or shell metacharacters in the filename need no escaping
  # and cannot be used for command injection. The command's standard output
  # is read and discarded, as the backtick call did.
  IO.popen(['ebook-convert', filename, txt_file], &:read)
  File.read(txt_file)
ensure
  FileUtils.remove_file(txt_file, true) if txt_file
end
273
+
259
274
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ebook_tools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.1.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-04-05 00:00:00.000000000 Z
12
+ date: 2013-06-04 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: uuid
@@ -156,7 +156,7 @@ dependencies:
156
156
  - !ruby/object:Gem::Version
157
157
  version: '0'
158
158
  description: 电子书工具集.
159
- email: aaron@nonobo.com
159
+ email: yalong1976@gmail.com
160
160
  executables:
161
161
  - ebook_tools
162
162
  - para_import_mongo
@@ -171,15 +171,16 @@ files:
171
171
  - bin/doc_book_import_mongo
172
172
  - bin/para_import_scheduling
173
173
  - bin/xml2json
174
+ - lib/doc_book_in_mongo.rb
174
175
  - lib/ebook_tools.rb
175
- - lib/txt_book.rb
176
+ - lib/epub.rb
177
+ - lib/epub_book.rb
176
178
  - lib/header_detect.rb
179
+ - lib/paras_in_mongo.rb
177
180
  - lib/pdf.rb
178
181
  - lib/txt.rb
179
- - lib/epub.rb
182
+ - lib/txt_book.rb
180
183
  - lib/utils.rb
181
- - lib/paras_in_mongo.rb
182
- - lib/doc_book_in_mongo.rb
183
184
  - workers/para_import_worker.rb
184
185
  - ebook_tools.gemspec
185
186
  homepage: