ebook_tools 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ebook_tools.gemspec CHANGED
@@ -2,13 +2,13 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{ebook_tools}
5
- s.version = '0.1.4'
5
+ s.version = '0.1.5'
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Aaron"]
9
- s.date = %q{2013-04-05}
9
+ s.date = %q{2013-06-04}
10
10
  s.description = %q{电子书工具集.}
11
- s.email = %q{aaron@nonobo.com}
11
+ s.email = %q{yalong1976@gmail.com}
12
12
  s.require_paths = ["lib"]
13
13
  s.requirements = ["none"]
14
14
  s.summary = %q{电子书工具集.}
@@ -25,15 +25,16 @@ Gem::Specification.new do |s|
25
25
  "bin/doc_book_import_mongo",
26
26
  "bin/para_import_scheduling",
27
27
  "bin/xml2json",
28
+ "lib/doc_book_in_mongo.rb",
28
29
  "lib/ebook_tools.rb",
29
- "lib/txt_book.rb",
30
- "lib/header_detect.rb",
31
- "lib/pdf.rb",
32
- "lib/txt.rb",
33
30
  "lib/epub.rb",
31
+ "lib/epub_book.rb",
32
+ "lib/header_detect.rb",
33
+ "lib/paras_in_mongo.rb",
34
+ "lib/pdf.rb",
35
+ "lib/txt.rb",
36
+ "lib/txt_book.rb",
34
37
  "lib/utils.rb",
35
- "lib/paras_in_mongo.rb",
36
- "lib/doc_book_in_mongo.rb",
37
38
  "workers/para_import_worker.rb",
38
39
  "ebook_tools.gemspec"
39
40
  ]
data/lib/ebook_tools.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  # encoding: UTF-8
3
- ['utils','epub','txt','pdf','header_detect','txt_book'].each do |file|
3
+ ['utils','epub','txt','pdf','header_detect','txt_book','epub_book'].each do |file|
4
4
  require File.join(File.dirname(__FILE__),file)
5
5
  end
6
6
 
@@ -202,27 +202,34 @@ module EbookTools
202
202
 
203
203
  def extract_book_struct_to_file(source,destination,options={})
204
204
  options[:title] ||= File.basename(source,File.extname(source))
205
- content = case File.extname(source)
206
- when '.html'
207
- extract_text_from_file(source,'.html')
208
- when '.epub'
209
- text = extract_text_from_file(source,'.epub')
210
- sanitize_for_epub_text(text)
211
- when '.txt'
212
- File.open(source).read
213
- end
214
- txt_book = TxtBook.new(content,options)
215
- docbook_xml = txt_book.to_doc_book
216
- if docbook_xml
217
- FileUtils.mkdir_p(File.dirname(destination)) unless Dir.exists?(File.dirname(destination))
218
- File.open(destination,'wb'){|file|file.write docbook_xml}
219
- puts "目录结构:"
220
- puts txt_book.toc_to_text
221
- puts "共修复#{txt_book.breaklines_count}个断点."
222
- return true
205
+ if File.extname(source) == '.epub'
206
+ epub_book = EpubBook.new(source,options)
207
+ docbook_xml = epub_book.to_doc_book
208
+ if docbook_xml
209
+ write_doc_book(destination,docbook_xml)
210
+ puts "目录结构:"
211
+ puts epub_book.toc_to_text
212
+ return true
213
+ end
223
214
  else
224
- return nil
215
+ content = case File.extname(source)
216
+ when '.html'
217
+ Utils.extract_text_from_file(source,'.html')
218
+ when '.txt'
219
+ File.open(source).read
220
+ end
221
+ txt_book = TxtBook.new(content,options)
222
+ docbook_xml = txt_book.to_doc_book
223
+ if docbook_xml
224
+ write_doc_book(destination,docbook_xml)
225
+ puts "目录结构:"
226
+ puts txt_book.toc_to_text
227
+ puts "共修复#{txt_book.breaklines_count}个断点."
228
+ return true
229
+ end
225
230
  end
231
+
232
+ return nil
226
233
  end
227
234
 
228
235
  # batch_extract_from_dir
@@ -262,6 +269,11 @@ module EbookTools
262
269
  end
263
270
  end
264
271
 
272
+ def write_doc_book(destination, docbook_xml)
273
+ FileUtils.mkdir_p(File.dirname(destination)) unless Dir.exists?(File.dirname(destination))
274
+ File.open(destination,'wb'){|file|file.write docbook_xml}
275
+ end
276
+
265
277
  # text_paras_repair
266
278
  # 对文本文件格式中的中断段落进行修复
267
279
  def text_paras_repair(source_file,target_file,options={})
@@ -271,21 +283,6 @@ module EbookTools
271
283
  File.open(target_file,'w'){|file| file.write content}
272
284
  end
273
285
 
274
- def extract_text_from_file(filename,format)
275
- txt_file = File.basename(filename,format)
276
- if !filename.include?("'")
277
- cmd = %Q(ebook-convert '#{filename}' '#{txt_file}.txt')
278
- elsif !filename.include?('"')
279
- cmd = %Q(ebook-convert "#{filename}" "#{txt_file}.txt")
280
- else
281
- cmd = %Q(ebook-convert #{filename} #{txt_file}.txt)
282
- end
283
- output = `#{cmd}`
284
- content = File.open("#{txt_file}.txt").read
285
- FileUtils.remove_file("#{txt_file}.txt",true)
286
- return content
287
- end
288
-
289
286
  # sanitize_for_epub_text
290
287
  def sanitize_for_epub_text(content)
291
288
  return content if content.blank?
data/lib/epub_book.rb ADDED
@@ -0,0 +1,164 @@
1
+ # encoding: utf-8
2
+ require 'uuid'
3
+ require 'cgi'
4
+
5
+ # epub_book
6
+ # 处理EPUB书的类。
7
+ class EpubBook
8
+ attr_reader :title,:author,:publisher,:pubdate,:isbn,:content,:outline
9
+
10
+ def initialize(filename,options={})
11
+ raise '无效的文件' unless File.exists?(filename)
12
+ @title = options[:title]
13
+ @author = options[:author]
14
+ @publisher = options[:publisher]
15
+ @pubdate= options[:pubdate]
16
+ @isbn = options[:isbn]
17
+
18
+ text = Utils.extract_text_from_file(filename,'.epub')
19
+ unless Utils.detect_utf8(text)
20
+ text = Utils.to_utf8(text)
21
+ end
22
+ text = preprocess_content(text)
23
+ @outline, @content = extract_for_epub_text(text)
24
+ end
25
+
26
+ def toc_to_text
27
+ outline
28
+ end
29
+
30
+ def to_doc_book
31
+ build_doc_book(@outline,@content,{:title=>title,:publisher=>publisher,:pubdate=>pubdate,:author=>author,:isbn=>isbn})
32
+ end
33
+
34
+
35
+ private
36
+ def preprocess_content(content)
37
+ paras = extract_paras(content)
38
+ paras.join("\n")
39
+ end
40
+
41
+ def extract_paras(content)
42
+ paras = []
43
+ return paras if content.blank?
44
+ content.each_line do |line|
45
+ text = Utils.clean_text(line)
46
+ paras << text if text.length > 0
47
+ end
48
+ paras
49
+ end
50
+
51
+ def build_doc_book(outline,content,options={})
52
+ doc_toc = gen_docbook_toc(outline.split("\n"))
53
+
54
+ doc_content = gen_docbook_content(content)
55
+
56
+ <<-EOS
57
+ <?xml version="1.0" encoding="utf-8"?>
58
+ <book xmlns="http://docbook.org/ns/docbook" version="5.0" id="#{UUID.generate}">
59
+ <info>
60
+ <title>#{options[:title]}</title>
61
+ <authorgroup>
62
+ <author><personname>#{options[:author]}</personname></author>
63
+ </authorgroup>
64
+ <pubdate>#{options[:pubdate]}</pubdate>
65
+ <publisher><publishername>#{options[:publisher]}</publishername></publisher>
66
+ </info>
67
+ #{doc_toc}
68
+ #{doc_content}
69
+ </book>
70
+ EOS
71
+ end
72
+
73
+ def gen_docbook_toc(toc)
74
+ "<toc><title>Table of contents</title>#{gen_docbook_tocdiv(toc)}</toc>"
75
+ end
76
+
77
+ def gen_docbook_tocdiv(toc)
78
+ toc.map do |item|
79
+ text = Utils.escape_html(Utils.clean_text(item))
80
+ "<tocdiv><title>#{item}</title></tocdiv>"
81
+ end.join("")
82
+ end
83
+
84
+ def gen_docbook_content(content)
85
+ paras = extract_paras(content)
86
+ paras_content = paras.map do |para|
87
+ text = Utils.escape_html(Utils.clean_text(para))
88
+ "<para id='#{UUID.generate}'>#{text}</para>"
89
+ end.join("\n")
90
+ "<sect1>#{paras_content}</sect1>"
91
+ end
92
+
93
+ # extract_for_epub_text
94
+ def extract_for_epub_text(content)
95
+ return content if content.blank?
96
+ if outline_type?(content)
97
+ extract_outline_with_content(content)
98
+ elsif toc_type?(content)
99
+ extract_toc_with_content(content)
100
+ else
101
+ ['',content]
102
+ end
103
+ end
104
+
105
+ def extract_outline_with_content(content)
106
+ outline_flag = false
107
+ lines = []
108
+ outline = []
109
+ content.each_line do |line|
110
+ if line.strip.downcase == 'document outline'
111
+ outline_flag = true
112
+ next
113
+ end
114
+ unless outline_flag
115
+ lines << line
116
+ else
117
+ outline << line;
118
+ end
119
+ end
120
+ [outline.join("") , lines.join("")]
121
+ end
122
+
123
+ def extract_toc_with_content(content)
124
+ toc = []
125
+ lines = []
126
+
127
+ paras = extract_paras(content)
128
+ index = paras.index('Content')
129
+ paras = paras[(index+1)..-1]
130
+
131
+ point = nil
132
+
133
+ paras.each_with_index do |para, index|
134
+ if toc.include?(para)
135
+ point = index
136
+ break
137
+ else
138
+ toc << para
139
+ end
140
+ end
141
+
142
+ lines = paras[(point+1)..-1]
143
+ [toc.join("\n") , lines.join("\n")]
144
+ rescue
145
+ ['', content]
146
+ end
147
+
148
+ def outline_type?(content)
149
+ content.each_line do |line|
150
+ return true if line.strip.downcase == 'document outline'
151
+ end
152
+ false
153
+ end
154
+
155
+ def toc_type?(content)
156
+ toc_flag = false
157
+ toc_flag1 = false
158
+ content.each_line do |line|
159
+ toc_flag = true if line.strip.downcase == '目录'
160
+ toc_flag1 = true if line.strip.downcase == 'content'
161
+ end
162
+ toc_flag && toc_flag1
163
+ end
164
+ end
data/lib/paras_in_mongo.rb CHANGED
@@ -2,6 +2,7 @@
2
2
  require 'moped'
3
3
  require 'nokogiri'
4
4
  require 'active_support'
5
+ require 'active_support/core_ext/hash'
5
6
 
6
7
  module ParasInMongo
7
8
  extend self
data/lib/txt_book.rb CHANGED
@@ -87,18 +87,22 @@ class TxtBook
87
87
  end
88
88
 
89
89
  def toc
90
- @toc ||= extract_toc_from_struct(struct_content)
90
+ @toc ||= extract_toc_from_struct(struct_content) if struct_content
91
91
  end
92
92
 
93
93
  def toc_to_text
94
- gen_toc(toc) do |item,children|
95
- "#{item[:title]}\n#{children}"
94
+ if toc
95
+ gen_toc(toc) do |item,children|
96
+ "#{item[:title]}\n#{children}"
97
+ end
96
98
  end
97
99
  end
98
100
 
99
101
  def to_doc_book
100
102
  if struct_content
101
103
  build_doc_book(struct_content,{:title=>title,:publisher=>publisher,:pubdate=>pubdate,:author=>author,:isbn=>isbn})
104
+ else
105
+ build_doc_book(content,{:title=>title,:publisher=>publisher,:pubdate=>pubdate,:author=>author,:isbn=>isbn})
102
106
  end
103
107
  end
104
108
 
@@ -184,12 +188,15 @@ class TxtBook
184
188
  marked_content
185
189
  end
186
190
 
187
- def build_doc_book(struct,options={})
188
- doc_toc = gen_docbook_toc(toc)
191
+ def build_doc_book(content,options={})
192
+ doc_toc = gen_docbook_toc(toc) if toc
189
193
 
190
- struct = struct.map{|item| item if item.is_a?(Hash)}.compact
191
-
192
- doc_content = gen_docbook_content(struct)
194
+ doc_content = if content.is_a?(Array)
195
+ struct = content.map{|item| item if item.is_a?(Hash)}.compact
196
+ gen_docbook_content_with_struct(struct)
197
+ else
198
+ gen_docbook_content(content)
199
+ end
193
200
 
194
201
  <<-EOS
195
202
  <?xml version="1.0" encoding="utf-8"?>
@@ -208,6 +215,8 @@ class TxtBook
208
215
  EOS
209
216
  end
210
217
 
218
+
219
+
211
220
  def build_struct(content)
212
221
  stack = Array.new(8)
213
222
  struct = []
@@ -314,18 +323,20 @@ EOS
314
323
  end
315
324
 
316
325
  def extract_toc_from_struct(struct)
317
- toc = []
318
- struct.each do |item|
319
- if item.is_a?(Hash)
320
- children = []
321
- if item[:children].any?
322
- children = extract_toc_from_struct(item[:children])
326
+ if struct
327
+ toc = []
328
+ struct.each do |item|
329
+ if item.is_a?(Hash)
330
+ children = []
331
+ if item[:children].any?
332
+ children = extract_toc_from_struct(item[:children])
333
+ end
334
+ item_hash = {:title=>item[:title], :type=> item[:type],:children=>children}
335
+ toc << item_hash
323
336
  end
324
- item_hash = {:title=>item[:title], :type=> item[:type],:children=>children}
325
- toc << item_hash
326
337
  end
338
+ toc
327
339
  end
328
- toc
329
340
  end
330
341
 
331
342
  def gen_docbook_toc(toc)
@@ -350,7 +361,16 @@ EOS
350
361
  end
351
362
  end
352
363
 
353
- def gen_docbook_content(struct)
364
+ def gen_docbook_content(content)
365
+ paras = extract_paras(content)
366
+ paras_content = paras.map do |para|
367
+ text = Utils.escape_html(Utils.clean_text(para))
368
+ "<para id='#{UUID.generate}'>#{text}</para>"
369
+ end.join("\n")
370
+ "<sect1>#{paras_content}</sect1>"
371
+ end
372
+
373
+ def gen_docbook_content_with_struct(struct)
354
374
  content = []
355
375
  struct.each do |item|
356
376
  if item.is_a?(Hash)
data/lib/utils.rb CHANGED
@@ -256,4 +256,19 @@ module Utils
256
256
  sections
257
257
  end
258
258
 
259
+ def extract_text_from_file(filename,format)
260
+ txt_file = File.basename(filename,format)
261
+ if !filename.include?("'")
262
+ cmd = %Q(ebook-convert '#{filename}' '#{txt_file}.txt')
263
+ elsif !filename.include?('"')
264
+ cmd = %Q(ebook-convert "#{filename}" "#{txt_file}.txt")
265
+ else
266
+ cmd = %Q(ebook-convert #{filename} #{txt_file}.txt)
267
+ end
268
+ output = `#{cmd}`
269
+ content = File.open("#{txt_file}.txt").read
270
+ FileUtils.remove_file("#{txt_file}.txt",true)
271
+ return content
272
+ end
273
+
259
274
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ebook_tools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.1.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-04-05 00:00:00.000000000 Z
12
+ date: 2013-06-04 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: uuid
@@ -156,7 +156,7 @@ dependencies:
156
156
  - !ruby/object:Gem::Version
157
157
  version: '0'
158
158
  description: 电子书工具集.
159
- email: aaron@nonobo.com
159
+ email: yalong1976@gmail.com
160
160
  executables:
161
161
  - ebook_tools
162
162
  - para_import_mongo
@@ -171,15 +171,16 @@ files:
171
171
  - bin/doc_book_import_mongo
172
172
  - bin/para_import_scheduling
173
173
  - bin/xml2json
174
+ - lib/doc_book_in_mongo.rb
174
175
  - lib/ebook_tools.rb
175
- - lib/txt_book.rb
176
+ - lib/epub.rb
177
+ - lib/epub_book.rb
176
178
  - lib/header_detect.rb
179
+ - lib/paras_in_mongo.rb
177
180
  - lib/pdf.rb
178
181
  - lib/txt.rb
179
- - lib/epub.rb
182
+ - lib/txt_book.rb
180
183
  - lib/utils.rb
181
- - lib/paras_in_mongo.rb
182
- - lib/doc_book_in_mongo.rb
183
184
  - workers/para_import_worker.rb
184
185
  - ebook_tools.gemspec
185
186
  homepage: