RubyGems - ebook_tools - Versions diffs - 0.1.4 → 0.1.5 - Mend

ebook_tools 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

data/ebook_tools.gemspec CHANGED

@@ -2,13 +2,13 @@
 Gem::Specification.new do |s|
   s.name = %q{ebook_tools}
-  s.version = '0.1.4'
+  s.version = '0.1.5'
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Aaron"]
-  s.date = %q{2013-04-05}
+  s.date = %q{2013-06-04}
   s.description = %q{电子书工具集.}
-  s.email = %q{aaron@nonobo.com}
+  s.email = %q{yalong1976@gmail.com}
   s.require_paths = ["lib"]
   s.requirements = ["none"]
   s.summary = %q{电子书工具集.}
@@ -25,15 +25,16 @@ Gem::Specification.new do |s|
     "bin/doc_book_import_mongo",
     "bin/para_import_scheduling",
     "bin/xml2json",
+    "lib/doc_book_in_mongo.rb",
     "lib/ebook_tools.rb",
-    "lib/txt_book.rb",
-    "lib/header_detect.rb",
-    "lib/pdf.rb",
-    "lib/txt.rb",
     "lib/epub.rb",
+    "lib/epub_book.rb",
+    "lib/header_detect.rb",
+    "lib/paras_in_mongo.rb",
+    "lib/pdf.rb",
+    "lib/txt.rb",
+    "lib/txt_book.rb",
     "lib/utils.rb",
-    "lib/paras_in_mongo.rb",
-    "lib/doc_book_in_mongo.rb",
     "workers/para_import_worker.rb",
     "ebook_tools.gemspec"
   ]

data/lib/ebook_tools.rb CHANGED

@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 # encoding: UTF-8
-['utils','epub','txt','pdf','header_detect','txt_book'].each do |file|
+['utils','epub','txt','pdf','header_detect','txt_book','epub_book'].each do |file|
   require File.join(File.dirname(__FILE__),file)
 end
@@ -202,27 +202,34 @@ module EbookTools
   def extract_book_struct_to_file(source,destination,options={})
     options[:title] ||= File.basename(source,File.extname(source))
-    content = case File.extname(source)
-      when '.html'
-        extract_text_from_file(source,'.html')
-      when '.epub'
-        text = extract_text_from_file(source,'.epub')
-        sanitize_for_epub_text(text)
-      when '.txt'
-        File.open(source).read
-    end
-    txt_book = TxtBook.new(content,options)
-    docbook_xml = txt_book.to_doc_book
-    if docbook_xml
-      FileUtils.mkdir_p(File.dirname(destination)) unless Dir.exists?(File.dirname(destination))
-      File.open(destination,'wb'){|file|file.write docbook_xml}
-      puts "目录结构:"
-      puts txt_book.toc_to_text
-      puts "共修复#{txt_book.breaklines_count}个断点."
-      return true
+    if File.extname(source) == '.epub'
+      epub_book = EpubBook.new(source,options)
+      docbook_xml = epub_book.to_doc_book
+      if docbook_xml
+        write_doc_book(destination,docbook_xml)
+        puts "目录结构:"
+        puts epub_book.toc_to_text
+        return true
+      end
     else
-      return nil
+      content = case File.extname(source)
+        when '.html'
+          Utils.extract_text_from_file(source,'.html')
+        when '.txt'
+          File.open(source).read
+      end
+      txt_book = TxtBook.new(content,options)
+      docbook_xml = txt_book.to_doc_book
+      if docbook_xml
+        write_doc_book(destination,docbook_xml)
+        puts "目录结构:"
+        puts txt_book.toc_to_text
+        puts "共修复#{txt_book.breaklines_count}个断点."
+        return true
+      end
     end
+    return nil
   end
   # batch_extract_from_dir
@@ -262,6 +269,11 @@ module EbookTools
     end
   end
+  def write_doc_book(destination, docbook_xml)
+    FileUtils.mkdir_p(File.dirname(destination)) unless Dir.exists?(File.dirname(destination))
+    File.open(destination,'wb'){|file|file.write docbook_xml}
+  end
   # text_paras_repair
   # 对文本文件格式中的中断段落进行修复
   def text_paras_repair(source_file,target_file,options={})
@@ -271,21 +283,6 @@ module EbookTools
     File.open(target_file,'w'){|file| file.write content}
   end
-  def extract_text_from_file(filename,format)
-    txt_file = File.basename(filename,format)
-    if !filename.include?("'")
-      cmd = %Q(ebook-convert '#{filename}' '#{txt_file}.txt')
-    elsif !filename.include?('"')
-      cmd = %Q(ebook-convert "#{filename}" "#{txt_file}.txt")
-    else
-      cmd = %Q(ebook-convert #{filename} #{txt_file}.txt)
-    end
-    output = `#{cmd}`
-    content = File.open("#{txt_file}.txt").read
-    FileUtils.remove_file("#{txt_file}.txt",true)
-    return content
-  end
   # sanitize_for_epub_text
   def sanitize_for_epub_text(content)
     return content if content.blank?

data/lib/epub_book.rb ADDED

@@ -0,0 +1,164 @@
+# encoding: utf-8
+require 'uuid'
+require 'cgi'
+# epub_book
+#  处理EPUB书的类。
+class EpubBook
+  attr_reader :title,:author,:publisher,:pubdate,:isbn,:content,:outline
+  def initialize(filename,options={})
+    raise '无效的文件' unless File.exists?(filename)
+    @title = options[:title]
+    @author = options[:author]
+    @publisher = options[:publisher]
+    @pubdate= options[:pubdate]
+    @isbn = options[:isbn]
+    text = Utils.extract_text_from_file(filename,'.epub')
+    unless  Utils.detect_utf8(text)
+      text = Utils.to_utf8(text)
+    end
+    text = preprocess_content(text)
+    @outline, @content = extract_for_epub_text(text)
+  end
+  def toc_to_text
+    outline
+  end
+  def to_doc_book
+    build_doc_book(@outline,@content,{:title=>title,:publisher=>publisher,:pubdate=>pubdate,:author=>author,:isbn=>isbn})
+  end
+  private
+    def preprocess_content(content)
+      paras = extract_paras(content)
+      paras.join("\n")
+    end
+    def extract_paras(content)
+      paras = []
+      return paras if content.blank?
+      content.each_line do |line|
+        text = Utils.clean_text(line)
+        paras << text if text.length > 0
+      end
+      paras
+    end
+    def build_doc_book(outline,content,options={})
+      doc_toc = gen_docbook_toc(outline.split("\n"))
+      doc_content = gen_docbook_content(content)
+<<-EOS
+<?xml version="1.0" encoding="utf-8"?>
+      <book xmlns="http://docbook.org/ns/docbook" version="5.0" id="#{UUID.generate}">
+      <info>
+      <title>#{options[:title]}</title>
+      <authorgroup>
+      <author><personname>#{options[:author]}</personname></author>
+      </authorgroup>
+      <pubdate>#{options[:pubdate]}</pubdate>
+      <publisher><publishername>#{options[:publisher]}</publishername></publisher>
+      </info>
+      #{doc_toc}
+      #{doc_content}
+      </book>
+EOS
+    end
+    def gen_docbook_toc(toc)
+      "<toc><title>Table of contents</title>#{gen_docbook_tocdiv(toc)}</toc>"
+    end
+    def gen_docbook_tocdiv(toc)
+      toc.map do |item|
+        text = Utils.escape_html(Utils.clean_text(item))
+        "<tocdiv><title>#{item}</title></tocdiv>"
+      end.join("")
+    end
+    def gen_docbook_content(content)
+      paras = extract_paras(content)
+      paras_content = paras.map do |para|
+        text = Utils.escape_html(Utils.clean_text(para))
+        "<para id='#{UUID.generate}'>#{text}</para>"
+      end.join("\n")
+      "<sect1>#{paras_content}</sect1>"
+    end
+    # extract_for_epub_text
+    def extract_for_epub_text(content)
+      return content if content.blank?
+      if outline_type?(content)
+        extract_outline_with_content(content)
+      elsif toc_type?(content)
+        extract_toc_with_content(content)
+      else
+        ['',content]
+      end
+    end
+    def extract_outline_with_content(content)
+      outline_flag = false
+      lines = []
+      outline = []
+      content.each_line do |line|
+        if line.strip.downcase == 'document outline'
+          outline_flag = true
+          next
+        end
+        unless outline_flag
+          lines << line
+        else
+          outline << line;
+        end
+      end
+      [outline.join("") , lines.join("")]
+    end
+    def extract_toc_with_content(content)
+      toc = []
+      lines = []
+      paras = extract_paras(content)
+      index = paras.index('Content')
+      paras = paras[(index+1)..-1]
+      point = nil
+      paras.each_with_index do |para, index|
+        if toc.include?(para)
+          point = index
+          break
+        else
+          toc << para
+        end
+      end
+      lines = paras[(point+1)..-1]
+      [toc.join("\n") , lines.join("\n")]
+    rescue
+      ['', content]
+    end
+    def outline_type?(content)
+      content.each_line do |line|
+          return true if line.strip.downcase  == 'document outline'
+      end
+      false
+    end
+    def toc_type?(content)
+      toc_flag = false
+      toc_flag1 = false
+      content.each_line do |line|
+          toc_flag = true if line.strip.downcase == '目录'
+          toc_flag1 = true if line.strip.downcase == 'content'
+      end
+      toc_flag && toc_flag1
+    end
+end

data/lib/paras_in_mongo.rb CHANGED

@@ -2,6 +2,7 @@
 require 'moped'
 require 'nokogiri'
 require 'active_support'
+require 'active_support/core_ext/hash'
 module ParasInMongo
   extend self

data/lib/txt_book.rb CHANGED

@@ -87,18 +87,22 @@ class TxtBook
   end
   def toc
-    @toc ||= extract_toc_from_struct(struct_content)
+    @toc ||= extract_toc_from_struct(struct_content) if struct_content
   end
   def toc_to_text
-    gen_toc(toc) do |item,children|
-      "#{item[:title]}\n#{children}"
+    if toc
+      gen_toc(toc) do |item,children|
+        "#{item[:title]}\n#{children}"
+      end
     end
   end
   def to_doc_book
     if struct_content
       build_doc_book(struct_content,{:title=>title,:publisher=>publisher,:pubdate=>pubdate,:author=>author,:isbn=>isbn})
+    else
+      build_doc_book(content,{:title=>title,:publisher=>publisher,:pubdate=>pubdate,:author=>author,:isbn=>isbn})
     end
   end
@@ -184,12 +188,15 @@ class TxtBook
     marked_content
   end
-  def build_doc_book(struct,options={})
-    doc_toc = gen_docbook_toc(toc)
+  def build_doc_book(content,options={})
+    doc_toc = gen_docbook_toc(toc) if toc
-    struct = struct.map{|item| item if item.is_a?(Hash)}.compact
-    doc_content = gen_docbook_content(struct)
+    doc_content =  if content.is_a?(Array)
+      struct = content.map{|item| item if item.is_a?(Hash)}.compact
+      gen_docbook_content_with_struct(struct)
+    else
+      gen_docbook_content(content)
+    end
 <<-EOS
 <?xml version="1.0" encoding="utf-8"?>
@@ -208,6 +215,8 @@ class TxtBook
 EOS
   end
   def build_struct(content)
     stack = Array.new(8)
     struct = []
@@ -314,18 +323,20 @@ EOS
   end
   def extract_toc_from_struct(struct)
-    toc = []
-    struct.each do |item|
-      if item.is_a?(Hash)
-        children = []
-        if item[:children].any?
-          children = extract_toc_from_struct(item[:children])
+    if struct
+      toc = []
+      struct.each do |item|
+        if item.is_a?(Hash)
+          children = []
+          if item[:children].any?
+            children = extract_toc_from_struct(item[:children])
+          end
+          item_hash = {:title=>item[:title], :type=> item[:type],:children=>children}
+          toc << item_hash
         end
-        item_hash = {:title=>item[:title], :type=> item[:type],:children=>children}
-        toc << item_hash
       end
+      toc
     end
-    toc
   end
   def gen_docbook_toc(toc)
@@ -350,7 +361,16 @@ EOS
     end
   end
-  def gen_docbook_content(struct)
+  def gen_docbook_content(content)
+    paras = extract_paras(content)
+    paras_content = paras.map do |para|
+      text = Utils.escape_html(Utils.clean_text(para))
+      "<para id='#{UUID.generate}'>#{text}</para>"
+    end.join("\n")
+    "<sect1>#{paras_content}</sect1>"
+  end
+  def gen_docbook_content_with_struct(struct)
     content = []
     struct.each do |item|
       if item.is_a?(Hash)

data/lib/utils.rb CHANGED

@@ -256,4 +256,19 @@ module Utils
     sections
   end
+  def extract_text_from_file(filename,format)
+    txt_file = File.basename(filename,format)
+    if !filename.include?("'")
+      cmd = %Q(ebook-convert '#{filename}' '#{txt_file}.txt')
+    elsif !filename.include?('"')
+      cmd = %Q(ebook-convert "#{filename}" "#{txt_file}.txt")
+    else
+      cmd = %Q(ebook-convert #{filename} #{txt_file}.txt)
+    end
+    output = `#{cmd}`
+    content = File.open("#{txt_file}.txt").read
+    FileUtils.remove_file("#{txt_file}.txt",true)
+    return content
+  end
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: ebook_tools
 version: !ruby/object:Gem::Version
-  version: 0.1.4
+  version: 0.1.5
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-04-05 00:00:00.000000000 Z
+date: 2013-06-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: uuid
@@ -156,7 +156,7 @@ dependencies:
       - !ruby/object:Gem::Version
         version: '0'
 description: 电子书工具集.
-email: aaron@nonobo.com
+email: yalong1976@gmail.com
 executables:
 - ebook_tools
 - para_import_mongo
@@ -171,15 +171,16 @@ files:
 - bin/doc_book_import_mongo
 - bin/para_import_scheduling
 - bin/xml2json
+- lib/doc_book_in_mongo.rb
 - lib/ebook_tools.rb
-- lib/txt_book.rb
+- lib/epub.rb
+- lib/epub_book.rb
 - lib/header_detect.rb
+- lib/paras_in_mongo.rb
 - lib/pdf.rb
 - lib/txt.rb
-- lib/epub.rb
+- lib/txt_book.rb
 - lib/utils.rb
-- lib/paras_in_mongo.rb
-- lib/doc_book_in_mongo.rb
 - workers/para_import_worker.rb
 - ebook_tools.gemspec
 homepage: