ebook_tools 0.0.6 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,6 @@
1
+ 0.1.0 2013.4.10
2
+ refactor struct extract
3
+
1
4
  0.0.6 2013.4.10
2
5
  fix bug: not open file on batch_convert
3
6
  fix bug: epub file can't be convert
data/ebook_tools.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{ebook_tools}
5
- s.version = '0.0.6'
5
+ s.version = '0.1.0'
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Aaron"]
@@ -25,7 +25,7 @@ Gem::Specification.new do |s|
25
25
  "bin/doc_book_import_mongo",
26
26
  "bin/xml2json",
27
27
  "lib/ebook_tools.rb",
28
- "lib/extract_book_struct.rb",
28
+ "lib/txt_book.rb",
29
29
  "lib/header_detect.rb",
30
30
  "lib/pdf.rb",
31
31
  "lib/txt.rb",
data/lib/ebook_tools.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  # encoding: UTF-8
3
- ['utils','epub','txt','pdf','header_detect','extract_book_struct'].each do |file|
3
+ ['utils','epub','txt','pdf','header_detect','txt_book'].each do |file|
4
4
  require File.join(File.dirname(__FILE__),file)
5
5
  end
6
6
 
@@ -201,16 +201,27 @@ module EbookTools
201
201
  end
202
202
 
203
203
  def extract_book_struct_to_file(source,destination,options={})
204
- method_name = "from_#{File.extname(source).gsub('.','')}"
205
- if ExtractBookStruct.respond_to?(method_name)
206
- docbook_xml = ExtractBookStruct.send(method_name,source,options)
207
- if docbook_xml
208
- FileUtils.mkdir_p(File.dirname(destination)) unless Dir.exists?(File.dirname(destination))
209
- File.open(destination,'wb'){|file|file.write docbook_xml}
210
- return true
211
- else
212
- return nil
213
- end
204
+ options[:title] ||= File.basename(source,File.extname(source))
205
+ content = case File.extname(source)
206
+ when '.html'
207
+ extract_text_from_file(source,'.html')
208
+ when '.epub'
209
+ text = extract_text_from_file(source,'.epub')
210
+ sanitize_for_epub_text(text)
211
+ when '.txt'
212
+ File.open(source).read
213
+ end
214
+ txt_book = TxtBook.new(content,options)
215
+ docbook_xml = txt_book.to_doc_book
216
+ if docbook_xml
217
+ FileUtils.mkdir_p(File.dirname(destination)) unless Dir.exists?(File.dirname(destination))
218
+ File.open(destination,'wb'){|file|file.write docbook_xml}
219
+ puts "目录结构:"
220
+ puts txt_book.toc_to_text
221
+ puts "共修复#{txt_book.breaklines_count}个断点."
222
+ return true
223
+ else
224
+ return nil
214
225
  end
215
226
  end
216
227
 
@@ -254,4 +265,27 @@ module EbookTools
254
265
  content = Utils.fixed_page_break(content,options)
255
266
  File.open(target_file,'w'){|file| file.write content}
256
267
  end
268
+
269
+ def extract_text_from_file(filename,format)
270
+ txt_file = File.basename(filename,format)
271
+ cmd = %Q(ebook-convert #{filename} #{txt_file}.txt)
272
+ output = `#{cmd}`
273
+ content = File.open("#{txt_file}.txt").read
274
+ FileUtils.remove_file("#{txt_file}.txt",true)
275
+ return content
276
+ end
277
+
278
+ # sanitize_for_epub_text
279
+ def sanitize_for_epub_text(content)
280
+ return content if content.blank?
281
+ lines = []
282
+ content.each_line do |line|
283
+ unless line.downcase.include?('document outline')
284
+ lines << line
285
+ else
286
+ break;
287
+ end
288
+ end
289
+ lines.join("")
290
+ end
257
291
  end
@@ -1,7 +1,10 @@
1
- # encoding: UTF-8
2
- # =ExtractBookStruct
3
- # ExtractBookStruct的目的是提取书的结构信息。
4
- # ExtractBookStruct选择从TXT文档提取书的结构信息。对TXT的文档要有如下要求:
1
+ # encoding: utf-8
2
+ require 'uuid'
3
+ require 'cgi'
4
+
5
+ #=文本书籍
6
+ # 处理TXT格式的书籍。
7
+ #
5
8
  # 1. 文档的编码格式必须是UTF-8或GB2312,推荐使用UTF-8格式
6
9
  # 2. 文档的内容只包含书内容部分(书名、作者、目录等信息应该不包含在文档内)
7
10
  # 3. 文档的段落应该完整(有些PDF转换过来的文档会破坏句子,需要进行预处理)
@@ -9,145 +12,83 @@
9
12
  # 5. 文档需要包含结构信息(例如: 卷、篇、部分、章(回)节或者有连续的序号)
10
13
  # 6. 每个结构信息都应该独立成行。
11
14
  #
12
- #
13
- # ==接口
14
- #
15
- # === ExtractBookStruct.from_txt
16
- # 从文本文件中提取目录结构
17
- #
18
- # === ExtractBookStruct.from_epub
19
- # 从EPUB文件中提取目录结构
20
- #
21
- # === ExtractBookStruct.from_html
22
- # 从HTML中提取目录结构
15
+ class TxtBook
16
+ include HeaderDetect
17
+ attr_reader :title,:author,:publisher,:pubdate,:isbn,:content
23
18
 
24
- require 'uuid'
25
- require 'cgi'
26
- require 'iconv'
27
-
28
- module ExtractBookStruct
29
- extend self
30
- extend HeaderDetect
31
-
32
- def from_txt(filename,options={})
33
- options[:title] ||= File.basename(filename,File.extname(filename))
19
+ def self.load(filename,options={})
20
+ raise '无效的文件' unless File.exists?(filename)
21
+ options[:title] = File.basename(filename, File.extname(filename))
34
22
  content = File.open(filename).read
35
- extract_book_struct(content,options)
36
- end
37
-
38
- def from_html(filename,options={})
39
- options[:title] ||= File.basename(filename,File.extname(filename))
40
- content = extract_text_from_file(filename,'.html')
41
- extract_book_struct(content,options)
42
- end
43
-
44
- def from_epub(filename,options={})
45
- options[:title] ||= File.basename(filename,File.extname(filename))
46
- content = extract_text_from_file(filename,'.epub')
47
- extract_book_struct(content,options)
23
+ new(content,options)
48
24
  end
25
+
26
+ def initialize(content,options={})
27
+ @title = options[:title]
28
+ @author = options[:author]
29
+ @publisher = options[:publisher]
30
+ @pubdate= options[:pubdate]
31
+ @isbn = options[:isbn]
32
+ @format = options[:format]
49
33
 
50
- def extract_book_struct(content,options={})
51
34
  unless Utils.detect_utf8(content)
52
35
  content = Utils.to_utf8(content)
53
36
  end
54
- content = sanitize_for_epub_text(content)
55
- paras = extract_paras(content)
56
- # 检查书类型(text,digital,hybrid)
57
- format = options[:format] || detect_struct_type(paras)
58
- case format
59
- when :text
60
- extract_text_book_struct(paras,options)
61
- when :digital
62
- extract_digital_book_struct(paras,options)
63
- when :hybrid
64
- extract_hybrid_book_struct(paras,options)
65
- else
66
- return nil
67
- end
37
+ @content = content
68
38
  end
69
39
 
70
- def extract_text_from_file(filename,format)
71
- txt_file = File.basename(filename,format)
72
- cmd = %Q(ebook-convert #{filename} #{txt_file}.txt)
73
- output = `#{cmd}`
74
- content = File.open("#{txt_file}.txt").read
75
- FileUtils.remove_file("#{txt_file}.txt",true)
76
- return content
40
+ def struct_content
41
+ return @struct_content if @struct_content
42
+ content = if breaklines_count > 100
43
+ Utils.fixed_page_break(@content)
44
+ else
45
+ @content
46
+ end
47
+ @struct_content = extract_book_struct(content,:format=>@format)
77
48
  end
78
49
 
79
- def extract_paras(content)
80
- paras = []
81
- return paras if content.blank?
82
- content.each_line do |line|
83
- text = Utils.clean_text(line)
84
- paras << text if text.length > 0
85
- end
86
- paras
50
+ def breaklines
51
+ @breaklines ||= Utils.breaklines(content)
87
52
  end
88
53
 
89
- def detect_struct_type(paras)
90
- text_flag = false
91
- digital_flag = false
92
- paras.each do |para|
93
- if guess_volume?(para) || guess_part?(para) || guess_chapter?(para) || guess_section?(para)
94
- text_flag = true
95
- end
96
-
97
- if guess_digital_header?(para)
98
- digital_flag = true
99
- end
100
- end
101
-
102
- if text_flag && digital_flag
103
- :hybrid
104
- elsif text_flag
105
- :text
106
- elsif digital_flag
107
- :digital
108
- else
109
- :unknown
110
- end
54
+ def breaklines_count
55
+ breaklines.count
111
56
  end
112
57
 
113
- # 从text类型书中提取结构
114
- def extract_text_book_struct(content,options={})
115
- # 标注结构信息
116
- marked_content = mark_struct_info(content)
117
- # 构建书结构
118
- struct = build_struct(marked_content)
119
- # 修正结构
120
- revised_struct = revise_struct(struct)
121
- # 生成docbook
122
- build_doc_book(revised_struct,options)
58
+ def toc
59
+ @toc ||= extract_toc_from_struct(struct_content)
123
60
  end
124
61
 
125
- # 从数字类型书中提取结构
126
- def extract_digital_book_struct(content,options={})
127
- marked_content = mark_digital_struct_info(content)
128
-
129
- # 构建书结构
130
- struct = build_struct(marked_content)
131
-
132
- # 修正结构
133
- revised_struct = revise_struct(struct)
134
-
135
- # 生成docbook
136
- build_doc_book(revised_struct,options)
62
+ def toc_to_text
63
+ gen_toc(toc) do |item,children|
64
+ "#{item[:title]}\n#{children}"
65
+ end
137
66
  end
138
67
 
139
- # 从混合类型书中提取结构
140
- def extract_hybrid_book_struct(content,options={})
141
- marked_content = mark_hybrid_struct_info(content)
142
-
143
- # 构建书结构
144
- struct = build_struct(marked_content)
145
-
146
- # 修正结构
147
- revised_struct = revise_struct(struct)
68
+ def to_doc_book
69
+ if struct_content
70
+ build_doc_book(struct_content,{:title=>title,:publisher=>publisher,:pubdate=>pubdate,:author=>author,:isbn=>isbn})
71
+ end
72
+ end
148
73
 
149
- # 生成docbook
150
- build_doc_book(revised_struct,options)
74
+ private
75
+ def extract_book_struct(content,options={})
76
+ paras = extract_paras(content)
77
+ # 检查书类型(text,digital,hybrid)
78
+ format = options[:format] || detect_struct_type(paras)
79
+ marked_content = case format
80
+ when :text
81
+ mark_struct_info(paras)
82
+ when :digital
83
+ mark_digital_struct_info(paras)
84
+ when :hybrid
85
+ mark_hybrid_struct_info(paras)
86
+ else
87
+ return nil
88
+ end
89
+ if marked_content
90
+ build_struct(marked_content)
91
+ end
151
92
  end
152
93
 
153
94
  # 标注结构信息
@@ -207,14 +148,7 @@ module ExtractBookStruct
207
148
  marked_content
208
149
  end
209
150
 
210
- # 修正结构 TODO
211
- def revise_struct(struct)
212
- struct
213
- end
214
-
215
151
  def build_doc_book(struct,options={})
216
- toc = extract_toc_from_struct(struct)
217
-
218
152
  doc_toc = gen_docbook_toc(toc)
219
153
 
220
154
  struct = struct.map{|item| item if item.is_a?(Hash)}.compact
@@ -380,20 +314,6 @@ EOS
380
314
  end
381
315
  end
382
316
 
383
- =begin
384
- def gen_docbook_tocdiv(toc)
385
- doc_toc = []
386
- toc.each do |item|
387
- children = ""
388
- if item[:children].any?
389
- children = gen_docbook_tocdiv(item[:children])
390
- end
391
- doc_toc << "<tocdiv><title>#{item[:title]}</title>#{children}</tocdiv>"
392
- end
393
- doc_toc.join("")
394
- end
395
- =end
396
-
397
317
  def gen_docbook_content(struct)
398
318
  content = []
399
319
  struct.each do |item|
@@ -420,17 +340,37 @@ EOS
420
340
  content.join("\n")
421
341
  end
422
342
 
423
- # sanitize_for_epub_text
424
- def sanitize_for_epub_text(content)
425
- return content if content.blank?
426
- lines = []
427
- content.each_line do |line|
428
- unless line.downcase.include?('document outline')
429
- lines << line
430
- else
431
- break;
343
+ def detect_struct_type(paras)
344
+ text_flag = false
345
+ digital_flag = false
346
+ paras.each do |para|
347
+ if guess_volume?(para) || guess_part?(para) || guess_chapter?(para) || guess_section?(para)
348
+ text_flag = true
432
349
  end
350
+
351
+ if guess_digital_header?(para)
352
+ digital_flag = true
353
+ end
354
+ end
355
+
356
+ if text_flag && digital_flag
357
+ :hybrid
358
+ elsif text_flag
359
+ :text
360
+ elsif digital_flag
361
+ :digital
362
+ else
363
+ :unknown
433
364
  end
434
- lines.join("")
365
+ end
366
+
367
+ def extract_paras(content)
368
+ paras = []
369
+ return paras if content.blank?
370
+ content.each_line do |line|
371
+ text = Utils.clean_text(line)
372
+ paras << text if text.length > 0
373
+ end
374
+ paras
435
375
  end
436
376
  end
data/lib/utils.rb CHANGED
@@ -23,15 +23,8 @@ module Utils
23
23
  # parameters:
24
24
  # +page_text+ 文本内容
25
25
  def fixed_page_break(page_text,options={})
26
- page_lines = []
27
26
  length = options[:length] || guess_content_line_length(page_text)
28
-
29
- page_text.each_line do |line|
30
- line.gsub!("\r\n","")
31
- line.gsub!("\n","")
32
- line.strip!
33
- page_lines << line
34
- end
27
+ page_lines = text_to_array(page_text)
35
28
 
36
29
  lines = []
37
30
  flag_tag = false
@@ -52,6 +45,32 @@ module Utils
52
45
  lines.join("\n")
53
46
  end
54
47
 
48
+ def breaklines(text,options={})
49
+ break_lines = []
50
+ lines = text_to_array(text)
51
+ length = options[:length] || guess_content_line_length(text)
52
+ lines.each do |line|
53
+ if line.length > 0
54
+ unless line_closed?(line,length)
55
+ break_lines << line
56
+ end
57
+ end
58
+ end
59
+ break_lines
60
+ end
61
+
62
+
63
+ def text_to_array(text)
64
+ page_lines = []
65
+ text.each_line do |line|
66
+ line.gsub!("\r\n","")
67
+ line.gsub!("\n","")
68
+ line.strip!
69
+ page_lines << line
70
+ end
71
+ page_lines
72
+ end
73
+
55
74
  # 计算文本相似度
56
75
  def text_similarity(text1,text2)
57
76
  return 0 if text1.blank? || text2.blank?
@@ -101,6 +120,7 @@ module Utils
101
120
  content.each_line{|line|
102
121
  lengths << line.length
103
122
  }
123
+ lengths.sort!
104
124
  while true
105
125
  line_length = lengths.pop
106
126
  break if line_length < 80
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ebook_tools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -155,7 +155,7 @@ files:
155
155
  - bin/doc_book_import_mongo
156
156
  - bin/xml2json
157
157
  - lib/ebook_tools.rb
158
- - lib/extract_book_struct.rb
158
+ - lib/txt_book.rb
159
159
  - lib/header_detect.rb
160
160
  - lib/pdf.rb
161
161
  - lib/txt.rb