ebook_tools 0.0.6 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,6 @@
1
+ 0.1.0 2013.4.10
2
+ refactor struct extract
3
+
1
4
  0.0.6 2013.4.10
2
5
  fix bug: not open file on batch_convert
3
6
  fix bug: epub file can't be convert
data/ebook_tools.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{ebook_tools}
5
- s.version = '0.0.6'
5
+ s.version = '0.1.0'
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Aaron"]
@@ -25,7 +25,7 @@ Gem::Specification.new do |s|
25
25
  "bin/doc_book_import_mongo",
26
26
  "bin/xml2json",
27
27
  "lib/ebook_tools.rb",
28
- "lib/extract_book_struct.rb",
28
+ "lib/txt_book.rb",
29
29
  "lib/header_detect.rb",
30
30
  "lib/pdf.rb",
31
31
  "lib/txt.rb",
data/lib/ebook_tools.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  # encoding: UTF-8
3
- ['utils','epub','txt','pdf','header_detect','extract_book_struct'].each do |file|
3
+ ['utils','epub','txt','pdf','header_detect','txt_book'].each do |file|
4
4
  require File.join(File.dirname(__FILE__),file)
5
5
  end
6
6
 
@@ -201,16 +201,27 @@ module EbookTools
201
201
  end
202
202
 
203
203
  def extract_book_struct_to_file(source,destination,options={})
204
- method_name = "from_#{File.extname(source).gsub('.','')}"
205
- if ExtractBookStruct.respond_to?(method_name)
206
- docbook_xml = ExtractBookStruct.send(method_name,source,options)
207
- if docbook_xml
208
- FileUtils.mkdir_p(File.dirname(destination)) unless Dir.exists?(File.dirname(destination))
209
- File.open(destination,'wb'){|file|file.write docbook_xml}
210
- return true
211
- else
212
- return nil
213
- end
204
+ options[:title] ||= File.basename(source,File.extname(source))
205
+ content = case File.extname(source)
206
+ when '.html'
207
+ extract_text_from_file(source,'.html')
208
+ when '.epub'
209
+ text = extract_text_from_file(source,'.epub')
210
+ sanitize_for_epub_text(text)
211
+ when '.txt'
212
+ File.open(source).read
213
+ end
214
+ txt_book = TxtBook.new(content,options)
215
+ docbook_xml = txt_book.to_doc_book
216
+ if docbook_xml
217
+ FileUtils.mkdir_p(File.dirname(destination)) unless Dir.exists?(File.dirname(destination))
218
+ File.open(destination,'wb'){|file|file.write docbook_xml}
219
+ puts "目录结构:"
220
+ puts txt_book.toc_to_text
221
+ puts "共修复#{txt_book.breaklines_count}个断点."
222
+ return true
223
+ else
224
+ return nil
214
225
  end
215
226
  end
216
227
 
@@ -254,4 +265,27 @@ module EbookTools
254
265
  content = Utils.fixed_page_break(content,options)
255
266
  File.open(target_file,'w'){|file| file.write content}
256
267
  end
268
+
269
+ def extract_text_from_file(filename,format)
270
+ txt_file = File.basename(filename,format)
271
+ cmd = %Q(ebook-convert #{filename} #{txt_file}.txt)
272
+ output = `#{cmd}`
273
+ content = File.open("#{txt_file}.txt").read
274
+ FileUtils.remove_file("#{txt_file}.txt",true)
275
+ return content
276
+ end
277
+
278
+ # sanitize_for_epub_text
279
+ def sanitize_for_epub_text(content)
280
+ return content if content.blank?
281
+ lines = []
282
+ content.each_line do |line|
283
+ unless line.downcase.include?('document outline')
284
+ lines << line
285
+ else
286
+ break;
287
+ end
288
+ end
289
+ lines.join("")
290
+ end
257
291
  end
@@ -1,7 +1,10 @@
1
- # encoding: UTF-8
2
- # =ExtractBookStruct
3
- # ExtractBookStruct的目的是提取书的结构信息。
4
- # ExtractBookStruct选择从TXT文档提取书的结构信息。对TXT的文档要有如下要求:
1
+ # encoding: utf-8
2
+ require 'uuid'
3
+ require 'cgi'
4
+
5
+ #=文本书籍
6
+ # 处理TXT格式的书籍。
7
+ #
5
8
  # 1. 文档的编码格式必须是UTF-8或GB2312,推荐使用UTF-8格式
6
9
  # 2. 文档的内容只包含书内容部分(书名、作者、目录等信息应该不包含在文档内)
7
10
  # 3. 文档的段落应该完整(有些PDF转换过来的文档会破坏句子,需要进行预处理)
@@ -9,145 +12,83 @@
9
12
  # 5. 文档需要包含结构信息(例如: 卷、篇、部分、章(回)节或者有连续的序号)
10
13
  # 6. 每个结构信息都应该独立成行。
11
14
  #
12
- #
13
- # ==接口
14
- #
15
- # === ExtractBookStruct.from_txt
16
- # 从文本文件中提取目录结构
17
- #
18
- # === ExtractBookStruct.from_epub
19
- # 从EPUB文件中提取目录结构
20
- #
21
- # === ExtractBookStruct.from_html
22
- # 从HTML中提取目录结构
15
+ class TxtBook
16
+ include HeaderDetect
17
+ attr_reader :title,:author,:publisher,:pubdate,:isbn,:content
23
18
 
24
- require 'uuid'
25
- require 'cgi'
26
- require 'iconv'
27
-
28
- module ExtractBookStruct
29
- extend self
30
- extend HeaderDetect
31
-
32
- def from_txt(filename,options={})
33
- options[:title] ||= File.basename(filename,File.extname(filename))
19
+ def self.load(filename,options={})
20
+ raise '无效的文件' unless File.exists?(filename)
21
+ options[:title] = File.basename(filename, File.extname(filename))
34
22
  content = File.open(filename).read
35
- extract_book_struct(content,options)
36
- end
37
-
38
- def from_html(filename,options={})
39
- options[:title] ||= File.basename(filename,File.extname(filename))
40
- content = extract_text_from_file(filename,'.html')
41
- extract_book_struct(content,options)
42
- end
43
-
44
- def from_epub(filename,options={})
45
- options[:title] ||= File.basename(filename,File.extname(filename))
46
- content = extract_text_from_file(filename,'.epub')
47
- extract_book_struct(content,options)
23
+ new(content,options)
48
24
  end
25
+
26
+ def initialize(content,options={})
27
+ @title = options[:title]
28
+ @author = options[:author]
29
+ @publisher = options[:publisher]
30
+ @pubdate= options[:pubdate]
31
+ @isbn = options[:isbn]
32
+ @format = options[:format]
49
33
 
50
- def extract_book_struct(content,options={})
51
34
  unless Utils.detect_utf8(content)
52
35
  content = Utils.to_utf8(content)
53
36
  end
54
- content = sanitize_for_epub_text(content)
55
- paras = extract_paras(content)
56
- # 检查书类型(text,digital,hybrid)
57
- format = options[:format] || detect_struct_type(paras)
58
- case format
59
- when :text
60
- extract_text_book_struct(paras,options)
61
- when :digital
62
- extract_digital_book_struct(paras,options)
63
- when :hybrid
64
- extract_hybrid_book_struct(paras,options)
65
- else
66
- return nil
67
- end
37
+ @content = content
68
38
  end
69
39
 
70
- def extract_text_from_file(filename,format)
71
- txt_file = File.basename(filename,format)
72
- cmd = %Q(ebook-convert #{filename} #{txt_file}.txt)
73
- output = `#{cmd}`
74
- content = File.open("#{txt_file}.txt").read
75
- FileUtils.remove_file("#{txt_file}.txt",true)
76
- return content
40
+ def struct_content
41
+ return @struct_content if @struct_content
42
+ content = if breaklines_count > 100
43
+ Utils.fixed_page_break(@content)
44
+ else
45
+ @content
46
+ end
47
+ @struct_content = extract_book_struct(content,:format=>@format)
77
48
  end
78
49
 
79
- def extract_paras(content)
80
- paras = []
81
- return paras if content.blank?
82
- content.each_line do |line|
83
- text = Utils.clean_text(line)
84
- paras << text if text.length > 0
85
- end
86
- paras
50
+ def breaklines
51
+ @breaklines ||= Utils.breaklines(content)
87
52
  end
88
53
 
89
- def detect_struct_type(paras)
90
- text_flag = false
91
- digital_flag = false
92
- paras.each do |para|
93
- if guess_volume?(para) || guess_part?(para) || guess_chapter?(para) || guess_section?(para)
94
- text_flag = true
95
- end
96
-
97
- if guess_digital_header?(para)
98
- digital_flag = true
99
- end
100
- end
101
-
102
- if text_flag && digital_flag
103
- :hybrid
104
- elsif text_flag
105
- :text
106
- elsif digital_flag
107
- :digital
108
- else
109
- :unknown
110
- end
54
+ def breaklines_count
55
+ breaklines.count
111
56
  end
112
57
 
113
- # 从text类型书中提取结构
114
- def extract_text_book_struct(content,options={})
115
- # 标注结构信息
116
- marked_content = mark_struct_info(content)
117
- # 构建书结构
118
- struct = build_struct(marked_content)
119
- # 修正结构
120
- revised_struct = revise_struct(struct)
121
- # 生成docbook
122
- build_doc_book(revised_struct,options)
58
+ def toc
59
+ @toc ||= extract_toc_from_struct(struct_content)
123
60
  end
124
61
 
125
- # 从数字类型书中提取结构
126
- def extract_digital_book_struct(content,options={})
127
- marked_content = mark_digital_struct_info(content)
128
-
129
- # 构建书结构
130
- struct = build_struct(marked_content)
131
-
132
- # 修正结构
133
- revised_struct = revise_struct(struct)
134
-
135
- # 生成docbook
136
- build_doc_book(revised_struct,options)
62
+ def toc_to_text
63
+ gen_toc(toc) do |item,children|
64
+ "#{item[:title]}\n#{children}"
65
+ end
137
66
  end
138
67
 
139
- # 从混合类型书中提取结构
140
- def extract_hybrid_book_struct(content,options={})
141
- marked_content = mark_hybrid_struct_info(content)
142
-
143
- # 构建书结构
144
- struct = build_struct(marked_content)
145
-
146
- # 修正结构
147
- revised_struct = revise_struct(struct)
68
+ def to_doc_book
69
+ if struct_content
70
+ build_doc_book(struct_content,{:title=>title,:publisher=>publisher,:pubdate=>pubdate,:author=>author,:isbn=>isbn})
71
+ end
72
+ end
148
73
 
149
- # 生成docbook
150
- build_doc_book(revised_struct,options)
74
+ private
75
+ def extract_book_struct(content,options={})
76
+ paras = extract_paras(content)
77
+ # 检查书类型(text,digital,hybrid)
78
+ format = options[:format] || detect_struct_type(paras)
79
+ marked_content = case format
80
+ when :text
81
+ mark_struct_info(paras)
82
+ when :digital
83
+ mark_digital_struct_info(paras)
84
+ when :hybrid
85
+ mark_hybrid_struct_info(paras)
86
+ else
87
+ return nil
88
+ end
89
+ if marked_content
90
+ build_struct(marked_content)
91
+ end
151
92
  end
152
93
 
153
94
  # 标注结构信息
@@ -207,14 +148,7 @@ module ExtractBookStruct
207
148
  marked_content
208
149
  end
209
150
 
210
- # 修正结构 TODO
211
- def revise_struct(struct)
212
- struct
213
- end
214
-
215
151
  def build_doc_book(struct,options={})
216
- toc = extract_toc_from_struct(struct)
217
-
218
152
  doc_toc = gen_docbook_toc(toc)
219
153
 
220
154
  struct = struct.map{|item| item if item.is_a?(Hash)}.compact
@@ -380,20 +314,6 @@ EOS
380
314
  end
381
315
  end
382
316
 
383
- =begin
384
- def gen_docbook_tocdiv(toc)
385
- doc_toc = []
386
- toc.each do |item|
387
- children = ""
388
- if item[:children].any?
389
- children = gen_docbook_tocdiv(item[:children])
390
- end
391
- doc_toc << "<tocdiv><title>#{item[:title]}</title>#{children}</tocdiv>"
392
- end
393
- doc_toc.join("")
394
- end
395
- =end
396
-
397
317
  def gen_docbook_content(struct)
398
318
  content = []
399
319
  struct.each do |item|
@@ -420,17 +340,37 @@ EOS
420
340
  content.join("\n")
421
341
  end
422
342
 
423
- # sanitize_for_epub_text
424
- def sanitize_for_epub_text(content)
425
- return content if content.blank?
426
- lines = []
427
- content.each_line do |line|
428
- unless line.downcase.include?('document outline')
429
- lines << line
430
- else
431
- break;
343
+ def detect_struct_type(paras)
344
+ text_flag = false
345
+ digital_flag = false
346
+ paras.each do |para|
347
+ if guess_volume?(para) || guess_part?(para) || guess_chapter?(para) || guess_section?(para)
348
+ text_flag = true
432
349
  end
350
+
351
+ if guess_digital_header?(para)
352
+ digital_flag = true
353
+ end
354
+ end
355
+
356
+ if text_flag && digital_flag
357
+ :hybrid
358
+ elsif text_flag
359
+ :text
360
+ elsif digital_flag
361
+ :digital
362
+ else
363
+ :unknown
433
364
  end
434
- lines.join("")
365
+ end
366
+
367
+ def extract_paras(content)
368
+ paras = []
369
+ return paras if content.blank?
370
+ content.each_line do |line|
371
+ text = Utils.clean_text(line)
372
+ paras << text if text.length > 0
373
+ end
374
+ paras
435
375
  end
436
376
  end
data/lib/utils.rb CHANGED
@@ -23,15 +23,8 @@ module Utils
23
23
  # parameters:
24
24
  # +page_text+ 文本内容
25
25
  def fixed_page_break(page_text,options={})
26
- page_lines = []
27
26
  length = options[:length] || guess_content_line_length(page_text)
28
-
29
- page_text.each_line do |line|
30
- line.gsub!("\r\n","")
31
- line.gsub!("\n","")
32
- line.strip!
33
- page_lines << line
34
- end
27
+ page_lines = text_to_array(page_text)
35
28
 
36
29
  lines = []
37
30
  flag_tag = false
@@ -52,6 +45,32 @@ module Utils
52
45
  lines.join("\n")
53
46
  end
54
47
 
48
+ def breaklines(text,options={})
49
+ break_lines = []
50
+ lines = text_to_array(text)
51
+ length = options[:length] || guess_content_line_length(text)
52
+ lines.each do |line|
53
+ if line.length > 0
54
+ unless line_closed?(line,length)
55
+ break_lines << line
56
+ end
57
+ end
58
+ end
59
+ break_lines
60
+ end
61
+
62
+
63
+ def text_to_array(text)
64
+ page_lines = []
65
+ text.each_line do |line|
66
+ line.gsub!("\r\n","")
67
+ line.gsub!("\n","")
68
+ line.strip!
69
+ page_lines << line
70
+ end
71
+ page_lines
72
+ end
73
+
55
74
  # 计算文本相似度
56
75
  def text_similarity(text1,text2)
57
76
  return 0 if text1.blank? || text2.blank?
@@ -101,6 +120,7 @@ module Utils
101
120
  content.each_line{|line|
102
121
  lengths << line.length
103
122
  }
123
+ lengths.sort!
104
124
  while true
105
125
  line_length = lengths.pop
106
126
  break if line_length < 80
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ebook_tools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -155,7 +155,7 @@ files:
155
155
  - bin/doc_book_import_mongo
156
156
  - bin/xml2json
157
157
  - lib/ebook_tools.rb
158
- - lib/extract_book_struct.rb
158
+ - lib/txt_book.rb
159
159
  - lib/header_detect.rb
160
160
  - lib/pdf.rb
161
161
  - lib/txt.rb