RubyGems - ebook_tools - Versions diffs - 0.0.6 → 0.1.0 - Mend

ebook_tools 0.0.6 → 0.1.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Files changed (6)

data/CHANGELOG +3 -0
data/ebook_tools.gemspec +2 -2
data/lib/ebook_tools.rb +45 -11
data/lib/{extract_book_struct.rb → txt_book.rb} +94 -154
data/lib/utils.rb +28 -8
metadata +2 -2

data/CHANGELOG CHANGED Viewed

@@ -1,3 +1,6 @@
+0.1.0 2013.4.10
+  refactor struct extract
 0.0.6 2013.4.10
   fix bug: not open file on batch_convert
   fix bug: epub file can't be convert

data/ebook_tools.gemspec CHANGED Viewed

@@ -2,7 +2,7 @@
 Gem::Specification.new do |s|
   s.name = %q{ebook_tools}
-  s.version = '0.0.6'
+  s.version = '0.1.0'
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Aaron"]
@@ -25,7 +25,7 @@ Gem::Specification.new do |s|
     "bin/doc_book_import_mongo",
     "bin/xml2json",
     "lib/ebook_tools.rb",
-    "lib/extract_book_struct.rb",
+    "lib/txt_book.rb",
     "lib/header_detect.rb",
     "lib/pdf.rb",
     "lib/txt.rb",

data/lib/ebook_tools.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 # encoding: UTF-8
-['utils','epub','txt','pdf','header_detect','extract_book_struct'].each do |file|
+['utils','epub','txt','pdf','header_detect','txt_book'].each do |file|
   require File.join(File.dirname(__FILE__),file)
 end
@@ -201,16 +201,27 @@ module EbookTools
   end
   def extract_book_struct_to_file(source,destination,options={})
-    method_name = "from_#{File.extname(source).gsub('.','')}"
-    if ExtractBookStruct.respond_to?(method_name)
-      docbook_xml = ExtractBookStruct.send(method_name,source,options)
-      if docbook_xml
-        FileUtils.mkdir_p(File.dirname(destination)) unless Dir.exists?(File.dirname(destination))
-        File.open(destination,'wb'){|file|file.write docbook_xml}
-        return true
-      else
-        return nil
-      end
+    options[:title] ||= File.basename(source,File.extname(source))
+    content = case File.extname(source)
+      when '.html'
+        extract_text_from_file(source,'.html')
+      when '.epub'
+        text = extract_text_from_file(source,'.epub')
+        sanitize_for_epub_text(text)
+      when '.txt'
+        File.open(source).read
+    end
+    txt_book = TxtBook.new(content,options)
+    docbook_xml = txt_book.to_doc_book
+    if docbook_xml
+      FileUtils.mkdir_p(File.dirname(destination)) unless Dir.exists?(File.dirname(destination))
+      File.open(destination,'wb'){|file|file.write docbook_xml}
+      puts "目录结构:"
+      puts txt_book.toc_to_text
+      puts "共修复#{txt_book.breaklines_count}个断点."
+      return true
+    else
+      return nil
     end
   end
@@ -254,4 +265,27 @@ module EbookTools
     content = Utils.fixed_page_break(content,options)
     File.open(target_file,'w'){|file| file.write content}
   end
+  def extract_text_from_file(filename,format)
+    txt_file = File.basename(filename,format)
+    cmd = %Q(ebook-convert #{filename} #{txt_file}.txt)
+    output = `#{cmd}`
+    content = File.open("#{txt_file}.txt").read
+    FileUtils.remove_file("#{txt_file}.txt",true)
+    return content
+  end
+  # sanitize_for_epub_text
+  def sanitize_for_epub_text(content)
+    return content if content.blank?
+    lines = []
+    content.each_line do |line|
+      unless line.downcase.include?('document outline')
+        lines << line
+      else
+        break;
+      end
+    end
+    lines.join("")
+  end
 end

data/lib/{extract_book_struct.rb → txt_book.rb} RENAMED Viewed

@@ -1,7 +1,10 @@
-# encoding: UTF-8
-# =ExtractBookStruct
-# ExtractBookStruct的目的是提取书的结构信息。
-# ExtractBookStruct选择从TXT文档提取书的结构信息。对TXT的文档要有如下要求:
+# encoding: utf-8
+require 'uuid'
+require 'cgi'
+#=文本书籍
+#  处理TXT格式的书籍。
+#
 #   1. 文档的编码格式必须是UTF-8或GB2312，推荐使用UTF-8格式
 #   2. 文档的内容只包含书内容部分（书名、作者、目录等信息应该不包含在文档内）
 #   3. 文档的段落应该完整（有些PDF转换过来的文档会破坏句子，需要进行预处理）
@@ -9,145 +12,83 @@
 #   5. 文档需要包含结构信息（例如： 卷、篇、部分、章（回）节或者有连续的序号）
 #   6. 每个结构信息都应该独立成行。
 #
-#
-# ==接口
-#
-# === ExtractBookStruct.from_txt
-#  从文本文件中提取目录结构
-#
-# === ExtractBookStruct.from_epub
-#  从EPUB文件中提取目录结构
-#
-# === ExtractBookStruct.from_html
-#  从HTML中提取目录结构
+class TxtBook
+  include HeaderDetect
+  attr_reader :title,:author,:publisher,:pubdate,:isbn,:content
-require 'uuid'
-require 'cgi'
-require 'iconv'
-module ExtractBookStruct
-  extend self
-  extend HeaderDetect
-  def from_txt(filename,options={})
-    options[:title] ||= File.basename(filename,File.extname(filename))
+  def self.load(filename,options={})
+    raise '无效的文件' unless File.exists?(filename)
+    options[:title] = File.basename(filename, File.extname(filename))
     content = File.open(filename).read
-    extract_book_struct(content,options)
-  end
-  def from_html(filename,options={})
-    options[:title] ||= File.basename(filename,File.extname(filename))
-    content = extract_text_from_file(filename,'.html')
-    extract_book_struct(content,options)
-  end
-  def from_epub(filename,options={})
-    options[:title] ||= File.basename(filename,File.extname(filename))
-    content = extract_text_from_file(filename,'.epub')
-    extract_book_struct(content,options)
+    new(content,options)
   end
+  def initialize(content,options={})
+    @title = options[:title]
+    @author = options[:author]
+    @publisher = options[:publisher]
+    @pubdate= options[:pubdate]
+    @isbn = options[:isbn]
+    @format = options[:format]
-  def extract_book_struct(content,options={})
     unless Utils.detect_utf8(content)
       content = Utils.to_utf8(content)
     end
-    content = sanitize_for_epub_text(content)
-    paras = extract_paras(content)
-    # 检查书类型（text,digital,hybrid)
-    format = options[:format] || detect_struct_type(paras)
-    case format
-    when :text
-      extract_text_book_struct(paras,options)
-    when :digital
-      extract_digital_book_struct(paras,options)
-    when :hybrid
-      extract_hybrid_book_struct(paras,options)
-    else
-      return nil
-    end
+    @content = content
   end
-  def extract_text_from_file(filename,format)
-    txt_file = File.basename(filename,format)
-    cmd = %Q(ebook-convert #{filename} #{txt_file}.txt)
-    output = `#{cmd}`
-    content = File.open("#{txt_file}.txt").read
-    FileUtils.remove_file("#{txt_file}.txt",true)
-    return content
+  def struct_content
+    return @struct_content if @struct_content
+    content = if breaklines_count > 100
+                Utils.fixed_page_break(@content)
+              else
+                @content
+              end
+    @struct_content = extract_book_struct(content,:format=>@format)
   end
-  def extract_paras(content)
-    paras = []
-    return paras if content.blank?
-    content.each_line do |line|
-      text = Utils.clean_text(line)
-      paras << text if text.length > 0
-    end
-    paras
+  def breaklines
+    @breaklines ||= Utils.breaklines(content)
   end
-  def detect_struct_type(paras)
-    text_flag = false
-    digital_flag = false
-    paras.each do |para|
-      if guess_volume?(para) || guess_part?(para) || guess_chapter?(para) || guess_section?(para)
-        text_flag = true
-      end
-      if guess_digital_header?(para)
-        digital_flag = true
-      end
-    end
-    if text_flag && digital_flag
-      :hybrid
-    elsif text_flag
-      :text
-    elsif digital_flag
-      :digital
-    else
-      :unknown
-    end
+  def breaklines_count
+    breaklines.count
   end
-  # 从text类型书中提取结构
-  def extract_text_book_struct(content,options={})
-    # 标注结构信息
-    marked_content = mark_struct_info(content)
-    # 构建书结构
-    struct = build_struct(marked_content)
-    # 修正结构
-    revised_struct = revise_struct(struct)
-    # 生成docbook
-    build_doc_book(revised_struct,options)
+  def toc
+    @toc ||= extract_toc_from_struct(struct_content)
   end
-  # 从数字类型书中提取结构
-  def extract_digital_book_struct(content,options={})
-    marked_content = mark_digital_struct_info(content)
-    # 构建书结构
-    struct = build_struct(marked_content)
-    # 修正结构
-    revised_struct = revise_struct(struct)
-    # 生成docbook
-    build_doc_book(revised_struct,options)
+  def toc_to_text
+    gen_toc(toc) do |item,children|
+      "#{item[:title]}\n#{children}"
+    end
   end
-  # 从混合类型书中提取结构
-  def extract_hybrid_book_struct(content,options={})
-    marked_content = mark_hybrid_struct_info(content)
-    # 构建书结构
-    struct = build_struct(marked_content)
-    # 修正结构
-    revised_struct = revise_struct(struct)
+  def to_doc_book
+    if struct_content
+      build_doc_book(struct_content,{:title=>title,:publisher=>publisher,:pubdate=>pubdate,:author=>author,:isbn=>isbn})
+    end
+  end
-    # 生成docbook
-    build_doc_book(revised_struct,options)
+  private
+  def extract_book_struct(content,options={})
+    paras = extract_paras(content)
+    # 检查书类型（text,digital,hybrid)
+    format = options[:format] || detect_struct_type(paras)
+    marked_content = case format
+    when :text
+       mark_struct_info(paras)
+    when :digital
+      mark_digital_struct_info(paras)
+    when :hybrid
+      mark_hybrid_struct_info(paras)
+    else
+      return nil
+    end
+    if marked_content
+      build_struct(marked_content)
+    end
   end
   # 标注结构信息
@@ -207,14 +148,7 @@ module ExtractBookStruct
     marked_content
   end
-  # 修正结构 TODO
-  def revise_struct(struct)
-    struct
-  end
   def build_doc_book(struct,options={})
-    toc = extract_toc_from_struct(struct)
     doc_toc = gen_docbook_toc(toc)
     struct = struct.map{|item| item if item.is_a?(Hash)}.compact
@@ -380,20 +314,6 @@ EOS
     end
   end
-=begin
-  def gen_docbook_tocdiv(toc)
-    doc_toc = []
-    toc.each do |item|
-      children = ""
-      if item[:children].any?
-        children = gen_docbook_tocdiv(item[:children])
-      end
-      doc_toc << "<tocdiv><title>#{item[:title]}</title>#{children}</tocdiv>"
-    end
-    doc_toc.join("")
-  end
-=end
   def gen_docbook_content(struct)
     content = []
     struct.each do |item|
@@ -420,17 +340,37 @@ EOS
     content.join("\n")
   end
-  # sanitize_for_epub_text
-  def sanitize_for_epub_text(content)
-    return content if content.blank?
-    lines = []
-    content.each_line do |line|
-      unless line.downcase.include?('document outline')
-        lines << line
-      else
-        break;
+  def detect_struct_type(paras)
+    text_flag = false
+    digital_flag = false
+    paras.each do |para|
+      if guess_volume?(para) || guess_part?(para) || guess_chapter?(para) || guess_section?(para)
+        text_flag = true
       end
+      if guess_digital_header?(para)
+        digital_flag = true
+      end
+    end
+    if text_flag && digital_flag
+      :hybrid
+    elsif text_flag
+      :text
+    elsif digital_flag
+      :digital
+    else
+      :unknown
     end
-    lines.join("")
+  end
+  def extract_paras(content)
+    paras = []
+    return paras if content.blank?
+    content.each_line do |line|
+      text = Utils.clean_text(line)
+      paras << text if text.length > 0
+    end
+    paras
   end
 end

data/lib/utils.rb CHANGED Viewed

@@ -23,15 +23,8 @@ module Utils
   # parameters:
   #   +page_text+   文本内容
   def fixed_page_break(page_text,options={})
-    page_lines = []
     length = options[:length] || guess_content_line_length(page_text)
-    page_text.each_line do |line|
-      line.gsub!("\r\n","")
-      line.gsub!("\n","")
-      line.strip!
-      page_lines << line
-    end
+    page_lines = text_to_array(page_text)
     lines = []
     flag_tag = false
@@ -52,6 +45,32 @@ module Utils
     lines.join("\n")
   end
+  def breaklines(text,options={})
+    break_lines = []
+    lines = text_to_array(text)
+    length = options[:length] || guess_content_line_length(text)
+    lines.each do |line|
+      if line.length > 0
+        unless line_closed?(line,length)
+          break_lines << line
+        end
+      end
+    end
+    break_lines
+  end
+  def text_to_array(text)
+    page_lines = []
+    text.each_line do |line|
+      line.gsub!("\r\n","")
+      line.gsub!("\n","")
+      line.strip!
+      page_lines << line
+    end
+    page_lines
+  end
   # 计算文本相似度
   def text_similarity(text1,text2)
     return 0 if text1.blank? || text2.blank?
@@ -101,6 +120,7 @@ module Utils
     content.each_line{|line|
       lengths << line.length
     }
+    lengths.sort!
     while true
       line_length = lengths.pop
       break if line_length < 80

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: ebook_tools
 version: !ruby/object:Gem::Version
-  version: 0.0.6
+  version: 0.1.0
   prerelease:
 platform: ruby
 authors:
@@ -155,7 +155,7 @@ files:
 - bin/doc_book_import_mongo
 - bin/xml2json
 - lib/ebook_tools.rb
-- lib/extract_book_struct.rb
+- lib/txt_book.rb
 - lib/header_detect.rb
 - lib/pdf.rb
 - lib/txt.rb