RubyGems - ebook_tools - Versions diffs - 0.0.1 - Mend

ebook_tools 0.0.1

Files changed (12) hide show

data/lib/epub.rb ADDED

@@ -0,0 +1,104 @@
+# encoding: UTF-8
+require 'gepub'
+require 'uuid'
+module EPUB
+  extend self
+  include Utils
+  # Packages a set of files into an EPUB 3 archive.
+  #
+  # parameters:
+  #  +epub_file+   path of the EPUB file to generate
+  #  +options+     optional settings
+  #     :files      files to package; the entry whose basename (minus ".html")
+  #                 is "nav" is registered as the navigation document
+  #     :title      EPUB title
+  #     :author     EPUB author
+  #     :publisher  EPUB publisher (defaults to 'www.nonobo.com')
+  #
+  # Raises ArgumentError when :files contains no navigation document.
+  def write_epub(epub_file,options={})
+    nav,files = extract_nav_from_files(options[:files] || [])
+    # Fail early with a clear message: File.basename(nil) further down would
+    # otherwise surface as an opaque TypeError.
+    raise ArgumentError, "options[:files] must include a nav.html navigation document" if nav.nil?
+    book = GEPUB::Book.new
+    book.set_main_id UUID.generate, {}
+    book.add_title options[:title]
+    book.version = '3.0'
+    # Reaches into the book's @package to switch off epub_backward_compat.
+    book.instance_variable_get('@package').epub_backward_compat = false
+    book.add_creator options[:author]
+    book.publisher = options[:publisher] || 'www.nonobo.com'
+    book.add_item(File.basename(nav),nav,'nav').add_property('nav')
+    files.each do |file|
+      if html_file?(file)
+        # HTML documents are added inside book.ordered; everything else is a plain item.
+        book.ordered{ book.add_item(File.basename(file),file) }
+      else
+        book.add_item(File.basename(file),file)
+      end
+    end
+    Utils.make_destination_dir(epub_file)
+    book.generate_epub(epub_file)
+  end
+  # Generates the EPUB3 navigation file and returns its path.
+  #
+  #  +html_file+  content document the navigation links point into
+  #  +sections+   array of hashes with :title, :page_num and optional :sub_sections
+  #  +options+
+  #     :dir      directory to write nav.html into (defaults to the directory of +html_file+)
+  #     :title    heading of the table of contents
+  def gen_nav_file(html_file,sections,options={})
+    temp_dir = options[:dir] || File.dirname(html_file)
+    nav_html = File.join(temp_dir,'nav.html')
+    # Forward the caller's options; the previous `options={}` argument
+    # reassigned them to an empty hash, so a custom :title never arrived.
+    html_content = gen_nav_file_content(html_file,sections,options)
+    Utils.write_file(html_content,nav_html)
+    nav_html
+  end
+  # Builds the markup of the navigation document as a string.
+  # +options+ may carry :title; it defaults to 'Table Of Contents'.
+  def gen_nav_file_content(html_file,sections,options={})
+    opts = {:title => 'Table Of Contents'}.merge(options)
+    <<-EOS
+<?xml version="1.0" encoding="utf-8"?>
+    <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
+      <head>
+        <META http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
+      </head>
+      <body>
+        <nav  epub:type="toc" id="toc">
+          <h1>#{opts[:title]}</h1>
+          <ol>
+            #{gen_nav_items("",html_file,sections)}
+          </ol>
+        </nav>
+      </body>
+</html>
+    EOS
+  end
+  private
+  # True when +file+ has an .html or .htm extension.
+  def html_file?(file)
+    ['.html','.htm'].include?(File.extname(file))
+  end
+  # Splits +files+ into [nav, others]: nav is the last entry whose basename
+  # (minus ".html") is "nav", or nil when there is none. The caller's array
+  # is left untouched.
+  def extract_nav_from_files(files=[])
+    nav_files,others = files.partition{|file| File.basename(file,'.html') == 'nav'}
+    [nav_files.last,others]
+  end
+  # Renders one <li> per section. +items+ is accepted for signature
+  # compatibility but is not used.
+  def gen_nav_items(items,filename,sections)
+    sections.map{|section| nav_item(filename,section)}.join("")
+  end
+  # Renders one <li> per section and returns +items+ with that markup appended.
+  def gen_sub_nav_items(items,filename,sections)
+    items + sections.map{|section| nav_item(filename,section)}.join("")
+  end
+  # Renders a single section as <li>, nesting an <ol> for :sub_sections.
+  def nav_item(filename,section)
+    sub_items = ""
+    if section[:sub_sections]
+      # Start every nested list from an empty accumulator so it contains only
+      # its own entries, not the siblings rendered before it.
+      sub_items = %Q(<ol>#{gen_sub_nav_items("",filename,section[:sub_sections])}</ol>)
+    end
+    %Q(<li><a href="#{File.basename(filename)}\##{section[:page_num]}">#{section[:title]}</a>#{sub_items}</li>)
+  end
+end

data/lib/extract_book_struct.rb ADDED

@@ -0,0 +1,415 @@
+# encoding: UTF-8
+# =ExtractBookStruct
+# ExtractBookStruct的目的是提取书的结构信息。
+# ExtractBookStruct选择从TXT文档提取书的结构信息。对TXT的文档要有如下要求:
+#   1. 文档的编码格式必须是UTF-8或GB2312，推荐使用UTF-8格式
+#   2. 文档的内容只包含书内容部分（书名、作者、目录等信息应该不包含在文档内）
+#   3. 文档的段落应该完整（有些PDF转换过来的文档会破坏句子，需要进行预处理）
+#   4. 文档必须符合正常的文档流（错位的章节段落等情况将影响正常的结构提取）
+#   5. 文档需要包含结构信息（例如： 卷、篇、部分、章（回）节或者有连续的序号）
+#   6. 每个结构信息都应该独立成行。
+#
+#
+# ==接口
+#
+# === ExtractBookStruct.from_txt
+#  从文本文件中提取目录结构
+#
+# === ExtractBookStruct.from_epub
+#  从EPUB文件中提取目录结构
+#
+# === ExtractBookStruct.from_html
+#  从HTML中提取目录结构
+require 'uuid'
+require 'cgi'
+require 'iconv'
+module ExtractBookStruct
+  extend self
+  extend HeaderDetect
+  # Extracts the book structure from a plain-text file.
+  #
+  #  +filename+  path of the text file to read
+  #  +options+   passed through unchanged to extract_book_struct
+  #
+  # Returns whatever extract_book_struct returns for the file's content.
+  def from_txt(filename,options={})
+    # File.read closes the handle itself; File.open(...).read left it open.
+    content = File.read(filename)
+    extract_book_struct(content,options)
+  end
+  # Extracts the book structure from an HTML file: the file is first turned
+  # into text by extract_text_from_file, then handed to extract_book_struct
+  # together with +options+.
+  def from_html(filename,options={})
+    extract_book_struct(extract_text_from_file(filename,'.html'),options)
+  end
+  # Extracts the book structure from an EPUB file: the file is first turned
+  # into text by extract_text_from_file, then handed to extract_book_struct
+  # together with +options+.
+  def from_epub(filename,options={})
+    extract_book_struct(extract_text_from_file(filename,'.epub'),options)
+  end
+  # Turns raw book text into a structured document.
+  #
+  #  +content+  the book text; converted with Utils.to_utf8 when
+  #             Utils.detect_utf8 reports it is not UTF-8
+  #  +options+
+  #     :format  :text, :digital or :hybrid; detected from the text when absent
+  #
+  # Returns the result of the matching extract_*_book_struct method, or nil
+  # when the format is none of the three.
+  def extract_book_struct(content,options={})
+    content = Utils.to_utf8(content) unless Utils.detect_utf8(content)
+    paras = extract_paras(sanitize_for_epub_text(content))
+    # Book type (text, digital, hybrid)
+    format = options[:format] || detect_struct_type(paras)
+    extractor = {
+      :text    => :extract_text_book_struct,
+      :digital => :extract_digital_book_struct,
+      :hybrid  => :extract_hybrid_book_struct
+    }[format]
+    extractor ? send(extractor,paras,options) : nil
+  end
+  # Converts +filename+ to plain text with the external `ebook-convert`
+  # command and returns the text.
+  #
+  #  +filename+  path of the file to convert
+  #  +format+    extension (e.g. '.html') stripped from the basename to name
+  #              the intermediate "<basename>.txt" file
+  #
+  # The intermediate file is created relative to the current working
+  # directory and removed again before returning, including when reading it
+  # fails. Raises the usual Errno errors when the command or the converted
+  # file is missing.
+  def extract_text_from_file(filename,format)
+    txt_path = "#{File.basename(filename,format)}.txt"
+    # Argument-array form: no shell is involved, so spaces or shell
+    # metacharacters in +filename+ are passed to the command literally.
+    # The command's stdout is read and discarded, as before.
+    IO.popen(['ebook-convert',filename,txt_path],&:read)
+    File.read(txt_path)
+  ensure
+    File.delete(txt_path) if txt_path && File.exist?(txt_path)
+  end
+  # Splits +content+ into paragraphs: every line is passed through
+  # Utils.clean_text and lines that come back empty are dropped.
+  # Returns an empty array when +content+ is blank.
+  # NOTE(review): blank? is not defined in this file; presumably provided by
+  # a support library loaded elsewhere - confirm.
+  def extract_paras(content)
+    return [] if content.blank?
+    cleaned = content.each_line.map{|line| Utils.clean_text(line)}
+    cleaned.select{|text| text.length > 0}
+  end
+  # Classifies the book by the kind of headers found in +paras+.
+  #
+  # Returns :hybrid when both worded headers (volume/part/chapter/section)
+  # and digital headers occur, :text or :digital when only one kind occurs,
+  # and :unknown when neither does.
+  def detect_struct_type(paras)
+    has_text = false
+    has_digital = false
+    paras.each do |para|
+      has_text = true if guess_volume?(para) || guess_part?(para) || guess_chapter?(para) || guess_section?(para)
+      has_digital = true if guess_digital_header?(para)
+    end
+    return :hybrid if has_text && has_digital
+    return :text if has_text
+    has_digital ? :digital : :unknown
+  end
+  # 从text类型书中提取结构
+  # Extracts the structure of a text-type book.
+  # Pipeline: mark header lines, build the tree, revise it, then render the
+  # DocBook document with +options+.
+  def extract_text_book_struct(content,options={})
+    revised_struct = revise_struct(build_struct(mark_struct_info(content)))
+    build_doc_book(revised_struct,options)
+  end
+  # 从数字类型书中提取结构
+  # Extracts the structure of a digital-type book.
+  # Pipeline: mark header lines (including digital headers), build the tree,
+  # revise it, then render the DocBook document with +options+.
+  def extract_digital_book_struct(content,options={})
+    revised_struct = revise_struct(build_struct(mark_digital_struct_info(content)))
+    build_doc_book(revised_struct,options)
+  end
+  # 从混合类型书中提取结构
+  # Extracts the structure of a hybrid-type book.
+  # Pipeline: mark header lines (including digital sections), build the
+  # tree, revise it, then render the DocBook document with +options+.
+  def extract_hybrid_book_struct(content,options={})
+    revised_struct = revise_struct(build_struct(mark_hybrid_struct_info(content)))
+    build_doc_book(revised_struct,options)
+  end
+  # 标注结构信息
+  #  将内容以行分割顺序存放在数组中，并对行猜测是否为结构信息，将猜测的结果以哈希的形式保存在数组中。
+  # Marks structure information.
+  # Walks the lines in order; a non-empty line that guess_header? recognises
+  # becomes {:title=>line, :type=>type}, any other non-empty line is kept
+  # as-is. Empty lines are dropped. Returns the new array.
+  def mark_struct_info(content)
+    content.each_with_object([]) do |text,marked|
+      next unless text.length > 0
+      type = guess_header?(text)
+      marked << (type ? {:title=>text,:type=>type} : text)
+    end
+  end
+  # Marks structure information for hybrid books.
+  # Each non-empty line is tried against guess_header? first and, only if
+  # that yields nothing, against guess_digital_section?. Recognised lines
+  # become {:title=>line, :type=>type}; the rest are kept as plain strings.
+  def mark_hybrid_struct_info(content)
+    content.each_with_object([]) do |text,marked|
+      next unless text.length > 0
+      type = guess_header?(text) || guess_digital_section?(text)
+      marked << (type ? {:title=>text,:type=>type} : text)
+    end
+  end
+  # Marks structure information for digital books.
+  # Each non-empty line is tried against guess_header? first and, only if
+  # that yields nothing, against guess_digital_header?. Recognised lines
+  # become {:title=>line, :type=>type}; the rest are kept as plain strings.
+  def mark_digital_struct_info(content)
+    content.each_with_object([]) do |text,marked|
+      next unless text.length > 0
+      type = guess_header?(text) || guess_digital_header?(text)
+      marked << (type ? {:title=>text,:type=>type} : text)
+    end
+  end
+  # 修正结构 TODO
+  # Placeholder for a structure-revision pass (still TODO): currently returns
+  # +struct+ unchanged.
+  def revise_struct(struct)
+    struct
+  end
+  # Renders +struct+ as a DocBook 5.0 <book> document and returns it as a string.
+  #
+  #  +struct+   tree produced by build_struct
+  #  +options+  :title, :author, :pubdate and :publisher fill the <info> block
+  #
+  # Only hash nodes at the top level are rendered; bare strings at the top
+  # level are left out of the content.
+  # NOTE(review): option values are interpolated as-is, so they need to be
+  # XML-safe already - confirm with callers.
+  def build_doc_book(struct,options={})
+    doc_toc = gen_docbook_toc(extract_toc_from_struct(struct))
+    nodes = struct.select{|item| item.is_a?(Hash)}
+    doc_content = gen_docbook_content(nodes)
+<<-EOS
+<?xml version="1.0" encoding="utf-8"?>
+    <book xmlns="http://docbook.org/ns/docbook" version="5.0">
+    <info>
+    <title>#{options[:title]}</title>
+    <author>#{options[:author]}</author>
+    <pubdate>#{options[:pubdate]}</pubdate>
+    <publisher>#{options[:publisher]}</publisher>
+    </info>
+    #{doc_toc}
+    #{doc_content}
+    </book>
+EOS
+  end
+  # Nesting depth (slot in the open-node stack) of every header type.
+  STRUCT_LEVELS = {
+    :volume => 0, :part => 1,
+    :chapter => 2, :appendix => 2, :index => 2, :glossary => 2, :preface => 2, :afterword => 2,
+    :sect1 => 3, :sect2 => 4, :sect3 => 5, :sect4 => 6, :sect5 => 7
+  }.freeze
+  # Chapter-level types under which a sect1 header is kept as a plain text line.
+  FLAT_SECTION_PARENTS = ['preface','appendix','index','glossary'].freeze
+  # Builds the nested book tree from marked content.
+  #
+  #  +content+  array of plain strings (paragraphs) and header hashes
+  #             {:title=>..., :type=>...} as produced by the mark_* methods
+  #
+  # Returns an array of top-level entries. Header nodes have the shape
+  # {:title, :type (String), :children}; paragraphs are attached to the
+  # deepest header that is open when they are met, or to the top level when
+  # none is open. Headers whose type is not in STRUCT_LEVELS are ignored.
+  def build_struct(content)
+    stack = Array.new(8)   # one slot per nesting level, nil = nothing open
+    struct = []
+    content.each do |line|
+      unless line.is_a?(Hash)
+        open_node = stack.compact.last
+        (open_node ? open_node[:children] : struct) << line
+        next
+      end
+      type = line[:type].to_sym
+      level = STRUCT_LEVELS[type]
+      next unless level
+      if type == :sect1 && stack[2] && FLAT_SECTION_PARENTS.include?(stack[2][:type])
+        # Inside preface/appendix/index/glossary a sect1 header is only text.
+        stack[2][:children] << line[:title]
+        next
+      end
+      # Close this level and everything deeper before opening the new node.
+      # This also covers sect5: it used to hand the live stack to
+      # closed_node, which popped the parent off it and lost that subtree.
+      close_levels(struct,stack,level)
+      stack[level] = {:title=>line[:title],:type=>type.to_s,:children=>[]}
+    end
+    close_levels(struct,stack,0)
+    struct
+  end
+  # Closes every open node from the deepest slot down to +level+, attaching
+  # each one to its nearest open ancestor (or to +struct+) and clearing its slot.
+  def close_levels(struct,stack,level)
+    7.downto(level) do |index|
+      # Pass a copy: closed_node consumes the array it is given.
+      closed_node(struct,stack[0..index])
+      stack[index] = nil
+    end
+  end
+  private :close_levels
+  # Attaches the last element of +stack+ to its nearest non-nil predecessor's
+  # :children, or appends it to +struct+ when there is no predecessor.
+  # Does nothing when the last element is nil.
+  #
+  # Destructive: elements are popped off +stack+, so pass a copy when the
+  # caller still needs it. Returns +struct+ when the node was appended to it,
+  # nil otherwise.
+  def closed_node(struct,stack)
+    last = stack.pop
+    return unless last
+    while stack.any?
+      item = stack.pop
+      next unless item
+      item[:children] << last
+      return nil
+    end
+    struct << last
+  end
+  # Derives the table-of-contents tree from +struct+.
+  # Only hash nodes are kept; each becomes {:title, :type, :children} where
+  # :children is the same reduction applied to the node's own children.
+  # Plain strings are skipped at every level.
+  def extract_toc_from_struct(struct)
+    struct.select{|item| item.is_a?(Hash)}.map do |item|
+      children = item[:children].any? ? extract_toc_from_struct(item[:children]) : []
+      {:title=>item[:title], :type=> item[:type],:children=>children}
+    end
+  end
+  # Wraps the tocdiv markup generated for +toc+ in a DocBook <toc> element
+  # titled "Table of contents" and returns it as a string.
+  def gen_docbook_toc(toc)
+    ['<toc><title>Table of contents</title>',gen_docbook_tocdiv(toc),'</toc>'].join
+  end
+  # Renders +toc+ (array of {:title, :children} hashes) as nested DocBook
+  # <tocdiv> elements and returns the markup as one string ("" for an empty
+  # list).
+  def gen_docbook_tocdiv(toc)
+    toc.map do |item|
+      children = item[:children].any? ? gen_docbook_tocdiv(item[:children]) : ""
+      # Titles are raw text lines; escape them so "&" or "<" in a heading
+      # cannot produce malformed XML.
+      "<tocdiv><title>#{CGI.escapeHTML(item[:title].to_s)}</title>#{children}</tocdiv>"
+    end.join("")
+  end
+  # Renders +struct+ as DocBook body markup and returns it as one string.
+  #
+  # Hash nodes become an element chosen by their :type ('volume' and 'part'
+  # become <part>, chapter-level and sect1..sect5 types keep their own name)
+  # with their children rendered recursively inside. Strings become <para>
+  # elements after Utils.clean_text and Utils.escape_html; strings that end
+  # up empty are skipped. Every element gets a freshly generated UUID.
+  # Nodes of any other type are skipped.
+  def gen_docbook_content(struct)
+    content = []
+    struct.each do |item|
+      if item.is_a?(Hash)
+        children = item[:children].any? ? gen_docbook_content(item[:children]) : ""
+        # Titles are raw text just like paragraphs, so escape them the same way.
+        title = Utils.escape_html(item[:title].to_s)
+        # build_struct produces 'afterword' nodes, but DocBook has no such
+        # element; emit them as <appendix> instead of silently dropping them.
+        tag = item[:type] == 'afterword' ? 'appendix' : item[:type]
+        case tag
+        when 'volume','part'
+          content << "<part label='#{UUID.generate}'>\n<info><title>#{title}</title>\n</info>#{children}\n</part>"
+        when 'chapter','appendix','glossary','index','preface','sect1','sect2','sect3','sect4','sect5'
+          content << "<#{tag} id='#{UUID.generate}'>\n<info>\n<title>#{title}</title>\n</info>#{children}\n</#{tag}>"
+        end
+      else
+        text = Utils.escape_html(Utils.clean_text(item))
+        content << "<para id='#{UUID.generate}'>#{text}</para>" if text.length > 0
+      end
+    end
+    content.join("\n")
+  end
+  # sanitize_for_epub_text
+  # Cuts +content+ off at the first line that contains "document outline"
+  # (case-insensitive): that line and everything after it are dropped.
+  # Blank content is returned unchanged.
+  # NOTE(review): blank? is not defined in this file; presumably provided by
+  # a support library loaded elsewhere - confirm.
+  def sanitize_for_epub_text(content)
+    return content if content.blank?
+    kept = content.each_line.take_while{|line| !line.downcase.include?('document outline')}
+    kept.join("")
+  end
+end