RubyGems - extract_book_struct - Versions diffs - 0.0.3 - Mend

extract_book_struct 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

data/CHANGELOG +6 -0
data/README +76 -0
data/bin/batch_extract_book_struct +58 -0
data/bin/extract_book_struct +95 -0
data/lib/batch_extract.rb +76 -0
data/lib/extract_book_struct.rb +594 -0
metadata +86 -0

data/CHANGELOG ADDED Viewed

@@ -0,0 +1,6 @@
+0.0.2 2013.3.28
+  * fix GB2312格式的文本文件提取错误
+  * 新增batch_extract_book_struct命令行工具
+0.0.1 2013.3.28
+  init release

data/README ADDED Viewed

@@ -0,0 +1,76 @@
+# encoding: UTF-8
+# = ExtractBookStruct
+# ExtractBookStruct的目的是从各类电子书内容中提取书的结构信息。目前支持txt,epub,html。
+# ExtractBookStruct选择从TXT文档提取书的结构信息。对TXT的文档要有如下要求:
+#   1. 文档的编码格式必须是UTF-8或GB2312，推荐使用UTF-8格式
+#   2. 文档的内容只包含书内容部分（书名、作者、目录等信息应该不包含在文档内）
+#   3. 文档的段落应该完整（有些PDF转换过来的文档会破坏句子，需要进行预处理）
+#   4. 文档必须符合正常的文档流（错位的章节段落等情况将影响正常的结构提取）
+#   5. 文档需要包含结构信息（例如： 卷、篇、部分、章（回）节或者有连续的序号）
+#   6. 每个结构信息都应该独立成行。
+#
+# 文档结构信息分析
+#   一本书在编排的时候会有自己的结构信息，这些结构信息通常通过卷、篇、部分、章(回)节等表述，也会使用序号的方式表述。总体上可以分为以下几种：
+#  1. 文本描述(text)： 按卷、部分(篇)、章（回）、节等文字表述
+#  2. 数字描述(digital)： 所有结构信息都是按照数字序号表示，比如 1 xxxxx; 1.1 xxxxx
+#  3. 混合描述(hybrid)：章按照文字表述，节按照序号表示，比如 1.1 xxxxxx
+#   根据不同的类型，对结构信息的提取采用不同的处理手段。
+#
+# 有效的标题信息应该符合以下规则:
+#  1. 标题应该不包含完整的句子（应该不包含句子分隔符，例如"。"、"!"等）
+#  2. 应该包含结构信息表述，具体如下：
+#    文本描述:
+#     卷:  以"第xxx卷"开始
+#          以"卷"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#          以"volume"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#     部分（篇）: 以"第xxx部"或"第xxx篇"开始
+#               以"part"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#     章(回）: 以"第xxx章"或"第xxx回"开始
+#              以"chapter"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#     节:  以"第xxx节"开始
+#     前言: 以"前"开始，以"言"结束，中间加入空白字符。例如"前言"，"前  言"等。
+#           以"序"开始，以"言"结束，中间加入空白字符。例如"序言"，"序  言"等。
+#           单个"序"
+#           以"序"或"序言"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#          "preface"
+#          "foreword"
+#           以"preface"或"foreword"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#     索引: 以"索"开始，以"引"结束，中间加入空白字符。例如"索引"，"索  引"等。
+#           以"索引"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#          "index"
+#           以"index"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#     附录: 以"附"开始，以"录"结束，中间加入空白字符。例如"附录"，"附  录"等。
+#           以"附录"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#          "appendix"
+#           以"appendix"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#     术语: 以"术"开始，以"语"结束，中间加入空白字符。例如"术语"，"术  语"等。
+#           以"术语"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#          "glossary"
+#           以"glossary"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#
+#    数字描述:
+#      以数字序号层级表达，数字序号和标题内容之间有空白字符分隔。例如"1 管理的概念", "1.1 定义", "1.1.1 管理"等。
+#
+# ==API接口
+#
+# === ExtractBookStruct.from_txt
+#  从文本文件中提取目录结构，使用示例:
+#     ExtractBookStruct.from_txt('1.txt',{:title=>'title',:author=>'author'})
+#
+# === ExtractBookStruct.from_epub
+#  从EPUB文件中提取目录结构，使用示例:
+#      ExtractBookStruct.from_epub('1.epub',{:title=>'title',:author=>'author'})
+#
+# === ExtractBookStruct.from_html
+#  从HTML中提取目录结构,使用示例:
+#      ExtractBookStruct.from_html('1.html',{:title=>'title',:author=>'author'})
+#
+# == 命令行工具
+#   extract_book_struct，使用示例:
+#      extract_book_struct 1.txt 1.xml
+#
+# == 依赖
+#   ExtractBookStruct依赖以下工具和包:
+#     ebook-convert: calibre cli tools.
+#     uuid: ruby gem.
+#     iconv: ruby gem.

data/bin/batch_extract_book_struct ADDED Viewed

@@ -0,0 +1,58 @@
+#!/usr/bin/env ruby
+# encoding: UTF-8
+require 'rubygems'
+require 'optparse'
+require File.join(File.expand_path('../../',__FILE__),'lib','batch_extract')
+def help
+  puts <<-EOF
+  extract_book_struct: 批量提取书结构信息
+  usage:
+    batch_extract_book_struct [options] source_dir destination_dir
+  source_dir： 指定需要提取结构信息的书所在目录
+  destination_dir: 指定提取的书结构信息所输出的文件目录
+  options:
+    -F,--format  指定要提取书的格式
+  适用对象要求:
+  1. 编码格式为utf-8
+  EOF
+  exit
+end
+# FileUtils is used below; it was previously only available when another
+# library happened to load it.
+require 'fileutils'
+# Collect the options. OptionParser#parse returns the arguments that are not
+# options, so the two directories are taken from that list rather than from
+# the raw ARGV (where a trailing "-F .txt" used to be read as directories).
+options = {}
+parser = OptionParser.new do |opts|
+  opts.on('-F format','--format format','format') do |format|
+    options[:format] = format
+  end
+  opts.on('-h','--help') do
+    help
+  end
+end
+args = parser.parse(ARGV)
+source_dir = args[-2]
+destination_dir = args[-1]
+# help prints the usage text and exits.
+help if source_dir.nil? || destination_dir.nil?
+unless File.directory?(source_dir)
+  puts "error: source_dir #{source_dir} not is directory"
+  exit 1
+end
+begin
+  # mkdir_p is a no-op for an existing directory and raises a
+  # SystemCallError when the path cannot be created.
+  FileUtils.mkdir_p(destination_dir)
+rescue SystemCallError
+  puts "error: destination_dir #{destination_dir} not created"
+  exit 1
+end
+BatchExtract.batch_extract_from_dir(source_dir,destination_dir,options)

data/bin/extract_book_struct ADDED Viewed

@@ -0,0 +1,95 @@
+#!/usr/bin/env ruby
+# encoding: UTF-8
+require 'rubygems'
+require 'optparse'
+require File.join(File.expand_path('../../',__FILE__),'lib','extract_book_struct')
+def help
+  puts <<-EOF
+  extract_book_struct: 提取书结构信息
+  usage:
+    extract_book_struct [options] source_file docbook_file
+  source_file： 指定需要提取结构信息的书文件
+  docbook_file: 指定提取的书结构信息所输出的文件
+  options:
+    -T <title>, --title <title> : 书的标题
+    -A <author>, --author <author> : 书作者
+    --pubdate <pubdate> : 出版时间
+    --publisher <publisher> : 出版社
+  适用对象要求:
+  1. 编码格式为utf-8
+  EOF
+  exit
+end
+options = {}
+opts = OptionParser.new do |opts|
+  opts.on('-T title','--title title','title') do |title|
+    options[:title] = title
+  end
+  opts.on('-A author','--author author','author') do |author|
+    options[:author] = author
+  end
+  opts.on('--publisher publisher','publisher') do |publisher|
+    options[:publisher] = publisher
+  end
+  opts.on('--pubdate pubdate','pubdate') do |pubdate|
+    options[:pubdate] = pubdate
+  end
+  opts.on('-h','--help') do
+    help
+    exit
+  end
+end
+opts.parse ARGV
+source_file = ARGV[-2]
+docbook_file = ARGV[-1]
+if source_file.nil? || docbook_file.nil?
+  help
+  exit
+end
+unless File.exists?(source_file)
+  puts "error: source_file #{source_file} no found"
+else
+  begin
+    dest_path = File.dirname(docbook_file)
+    FileUtils.mkdir_p(dest_path) unless Dir.exists?(dest_path)
+  rescue
+    puts "error: docbook_file #{docbook_file} not created"
+    exit
+  end
+  ext_name = File.extname(source_file).downcase
+  options[:title] ||= File.basename(source_file,ext_name)
+  unless ['.html','.txt','.epub'].include?(ext_name)
+    puts "source_file不是允许的文件格式: txt,html,epub"
+    exit
+  end
+  begin
+    docbook_xml = case ext_name
+      when '.html'
+        ExtractBookStruct.from_html(source_file,options)
+      when '.txt'
+        ExtractBookStruct.from_txt(source_file,options)
+      when '.epub'
+        ExtractBookStruct.from_epub(source_file,options)
+    end
+    if docbook_xml
+      File.open(docbook_file,'wb'){|file|file.write docbook_xml}
+      puts "success: extract book struct  successfully!"
+    end
+  rescue => e
+    puts "error: #{source_file} \n#{e.backtrace.join("\n")}"
+  end
+end

data/lib/batch_extract.rb ADDED Viewed

@@ -0,0 +1,76 @@
+# encoding: UTF-8
+require 'pathname'
+require File.join(File.expand_path('../../',__FILE__),'lib','extract_book_struct')
+module BatchExtract
+  extend self
+  # batch_extract_from_dir
+  #  batch extract book struct form dir
+  # parameters:
+  #   +source+     source directory
+  #   +destination+   output directory
+  #   +options+        optional parameter.
+  #      :format     指定需要提取结构的文件后缀名，例如要从所有txt文件中提取，通过:format=>'.txt'指定
+  def batch_extract_from_dir(source,destination,options={})
+    format = options.delete(:format)
+    files = scan_file_from_dir(source,{:format=>format})
+    files.each do |file|
+      extname = File.extname(file)
+      basename = File.basename(file,extname)
+      dest_file = File.join(File.dirname(File.join(destination,file.gsub(source,''))),"#{basename}.xml")
+      puts "start extract #{file} ..."
+      begin
+        docbook_xml = case extname
+        when '.html'
+          ExtractBookStruct.from_html(file,options)
+        when '.txt'
+          ExtractBookStruct.from_txt(file,options)
+        when '.epub'
+          ExtractBookStruct.from_epub(file,options)
+        else
+          nil
+        end
+        if docbook_xml
+          File.open(dest_file,'wb'){|file|file.write docbook_xml}
+          puts "success: extract book struct  successfully!"
+        end
+      #rescue => e
+      #  puts "error: #{file} \n#{e.backtrace.join("\n")}"
+      end
+    end
+  end
+  # scan_file_from_dir
+  # 遍历目录下的文件
+  # parameters:
+  #   +dir+       需遍历的目录
+  #   +options+   可选参数
+  #      :format     指定需要遍历的文件后缀名，例如要遍历所有pdf文件，通过:format=>'.pdf'指定
+  def scan_file_from_dir(dir,options={})
+    files = []
+    walk_dir(dir,options) do |file|
+      files << file.to_s
+    end
+    files
+  end
+  def walk_dir(path_str,options={})
+    path = Pathname.new(path_str)
+    format = options[:format]
+    path.children.each do |entry|
+      if entry.directory?
+        walk_dir(entry) {|x| yield(x)}
+      elsif entry.file?
+        if format
+          if entry.extname == format
+            yield entry
+          end
+        else
+          yield entry
+        end
+      end
+    end
+  end
+end

data/lib/extract_book_struct.rb ADDED Viewed

@@ -0,0 +1,594 @@
+# encoding: UTF-8
+# =ExtractBookStruct
+# ExtractBookStruct的目的是提取书的结构信息。
+# ExtractBookStruct选择从TXT文档提取书的结构信息。对TXT的文档要有如下要求:
+#   1. 文档的编码格式必须是UTF-8或GB2312，推荐使用UTF-8格式
+#   2. 文档的内容只包含书内容部分（书名、作者、目录等信息应该不包含在文档内）
+#   3. 文档的段落应该完整（有些PDF转换过来的文档会破坏句子，需要进行预处理）
+#   4. 文档必须符合正常的文档流（错位的章节段落等情况将影响正常的结构提取）
+#   5. 文档需要包含结构信息（例如： 卷、篇、部分、章（回）节或者有连续的序号）
+#   6. 每个结构信息都应该独立成行。
+#
+# 文档结构信息分析
+#   一本书在编排的时候会有自己的结构信息，这些结构信息通常通过卷、篇、部分、章(回)节等表述，也会使用序号的方式表述。总体上可以分为以下几种：
+#  1. 文本描述(text)： 按卷、部分(篇)、章（回）、节等文字表述
+#  2. 数字描述(digital)： 所有结构信息都是按照数字序号表示，比如 1 xxxxx; 1.1 xxxxx
+#  3. 混合描述(hybrid)：章按照文字表述，节按照序号表示，比如 1.1 xxxxxx
+#   根据不同的类型，对结构信息的提取采用不同的处理手段。
+#
+# 有效的标题信息应该符合以下规则:
+#  1. 标题应该不包含完整的句子（应该不包含句子分隔符，例如"。"、"!"等）
+#  2. 应该包含结构信息表述，具体如下：
+#    文本描述:
+#     卷:  以"第xxx卷"开始
+#          以"卷"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#          以"volume"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#     部分（篇）: 以"第xxx部"或"第xxx篇"开始
+#               以"part"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#     章(回）: 以"第xxx章"或"第xxx回"开始
+#              以"chapter"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#     节:  以"第xxx节"开始
+#     前言: 以"前"开始，以"言"结束，中间加入空白字符。例如"前言"，"前  言"等。
+#           以"序"开始，以"言"结束，中间加入空白字符。例如"序言"，"序  言"等。
+#           单个"序"
+#           以"序"或"序言"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#          "preface"
+#          "foreword"
+#           以"preface"或"foreword"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#     索引: 以"索"开始，以"引"结束，中间加入空白字符。例如"索引"，"索  引"等。
+#           以"索引"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#          "index"
+#           以"index"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#     附录: 以"附"开始，以"录"结束，中间加入空白字符。例如"附录"，"附  录"等。
+#           以"附录"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#          "appendix"
+#           以"appendix"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#     术语: 以"术"开始，以"语"结束，中间加入空白字符。例如"术语"，"术  语"等。
+#           以"术语"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#          "glossary"
+#           以"glossary"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#
+#    数字描述:
+#      以数字序号层级表达，数字序号和标题内容之间有空白字符分隔。例如"1 管理的概念", "1.1 定义", "1.1.1 管理"等。
+#
+# ==接口
+#
+# === ExtractBookStruct.from_txt
+#  从文本文件中提取目录结构
+#
+# === ExtractBookStruct.from_epub
+#  从EPUB文件中提取目录结构
+#
+# === ExtractBookStruct.from_html
+#  从HTML中提取目录结构
+require 'cgi'
+require 'fileutils'
+require 'open3'
+require 'tmpdir'
+require 'uuid'
+require 'iconv'
+module ExtractBookStruct
+  extend self
+  def from_txt(filename,options={})
+    content = File.open(filename).read
+    unless detect_utf8(content)
+      content = to_utf8(content)
+    end
+    content = sanitize_for_epub_text(content)
+    paras = extract_paras(content)
+    extract_book_struct(paras,options)
+  end
+  def from_html(filename,options={})
+    content = extract_text_from_file(filename,'.html')
+    content = to_utf8(content) unless detect_utf8(content)
+    paras = extract_paras(content)
+    extract_book_struct(paras,options)
+  end
+  def from_epub(filename,options={})
+    content = extract_text_from_file(filename,'.epub')
+    content = to_utf8(content) unless detect_utf8(content)
+    paras = extract_paras(content)
+    extract_book_struct(paras,options)
+  end
+  def extract_book_struct(paras,options={})
+    # 检查书类型（text,digital,hybrid)
+    format = options[:format] || detect_struct_type(paras)
+    case format
+    when :text
+      extract_text_book_struct(paras,options)
+    when :digital
+      extract_digital_book_struct(paras,options)
+    when :hybrid
+      extract_hybrid_book_struct(paras,options)
+    else
+      puts "警告: 没有检测到书结构信息."
+      return nil
+    end
+  end
+  def extract_text_from_file(filename,format)
+    txt_file = File.basename(filename,format)
+    cmd = "ebook-convert #{filename} #{txt_file}.txt"
+    output = `#{cmd}`
+    content = File.open("#{txt_file}.txt").read
+    FileUtils.remove_file("#{txt_file}.txt",true)
+    sanitize_for_epub_text(content)
+  end
+  def extract_paras(content)
+    paras = []
+    content.each_line do |line|
+      text = clean_text(line)
+      paras << text if text.length > 0
+    end
+    paras
+  end
+  def detect_struct_type(paras)
+    text_flag = false
+    digital_flag = false
+    paras.each do |para|
+      if guess_volume?(para) || guess_part?(para) || guess_chapter?(para) || guess_section?(para)
+        text_flag = true
+      end
+      if guess_digital_head_line?(para)
+        digital_flag = true
+      end
+    end
+    if text_flag && digital_flag
+      :hybrid
+    elsif text_flag
+      :text
+    elsif digital_flag
+      :digital
+    else
+      :unknown
+    end
+  end
+  # 从text类型书中提取结构
+  def extract_text_book_struct(content,options={})
+    # 标注结构信息
+    marked_content = mark_struct_info(content)
+    # 构建书结构
+    struct = build_struct(marked_content)
+    # 修正结构
+    revised_struct = revise_struct(struct)
+    # 生成docbook
+    build_doc_book(revised_struct,options)
+  end
+  # 从数字类型书中提取结构
+  def extract_digital_book_struct(content,options={})
+    marked_content = mark_digital_struct_info(content)
+    # 构建书结构
+    struct = build_struct(marked_content)
+    # 修正结构
+    revised_struct = revise_struct(struct)
+    # 生成docbook
+    build_doc_book(revised_struct,options)
+  end
+  # 从混合类型书中提取结构
+  def extract_hybrid_book_struct(content,options={})
+    marked_content = mark_hybrid_struct_info(content)
+    # 构建书结构
+    struct = build_struct(marked_content)
+    # 修正结构
+    revised_struct = revise_struct(struct)
+    # 生成docbook
+    build_doc_book(revised_struct,options)
+  end
+  # 标注结构信息
+  #  将内容以行分割顺序存放在数组中，并对行猜测是否为结构信息，将猜测的结果以哈希的形式保存在数组中。
+  def mark_struct_info(content)
+    marked_content = []
+    content.each do |text|
+      if text.length > 0
+        type = guess_head_line?(text)
+        if type
+          marked_content << {:title=>text,:type=>type}
+        else
+          marked_content << text
+        end
+      end
+    end
+    marked_content
+  end
+  def mark_hybrid_struct_info(content)
+    marked_content = []
+    content.each do |text|
+      if text.length > 0
+        type = guess_head_line?(text)
+        if type
+          marked_content << {:title=>text,:type=>type}
+        else
+          type = guess_digital_section?(text)
+          if type
+            marked_content << {:title=>text,:type=>type}
+          else
+            marked_content << text
+          end
+        end
+      end
+    end
+    marked_content
+  end
+  def mark_digital_struct_info(content)
+    marked_content = []
+    content.each do |text|
+      if text.length > 0
+        type = guess_head_line?(text)
+        if type
+          marked_content << {:title=>text,:type=>type}
+        else
+          type = guess_digital_head_line?(text)
+          if type
+            marked_content << {:title=>text,:type=>type}
+          else
+            marked_content << text
+          end
+        end
+      end
+    end
+    marked_content
+  end
+  # Revise the structure built by build_struct.
+  # TODO: no revision is implemented yet; +struct+ is returned unchanged.
+  def revise_struct(struct)
+    struct
+  end
+  def build_doc_book(struct,options={})
+    toc = extract_toc_from_struct(struct)
+    doc_toc = gen_docbook_toc(toc)
+    struct = struct.map{|item| item if item.is_a?(Hash)}.compact
+    doc_content = gen_docbook_content(struct)
+<<-EOS
+<?xml version="1.0" encoding="utf-8"?>
+    <book xmlns="http://docbook.org/ns/docbook" version="5.0">
+    <info>
+    <title>#{options[:title]}</title>
+    <author>#{options[:author]}</author>
+    <pubdate>#{options[:pubdate]}</pubdate>
+    <publisher>#{options[:publisher]}</publisher>
+    </info>
+    #{doc_toc}
+    #{doc_content}
+    </book>
+EOS
+  end
+  def guess_volume?(text,options={})
+    return false if hav_complete_sentence?(text)
+    return true if (text =~ /^第.{1,3}卷/ || text =~ /^卷\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/)
+    text = text.downcase
+    return true if text =~ /^volume\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
+  end
+  def guess_part?(text,options={})
+    return false if hav_complete_sentence?(text)
+    return true if text =~ /^第.{1,3}[部篇]/
+    text = text.downcase
+    return true if text =~ /^part\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
+  end
+  def guess_chapter?(text)
+    return false if hav_complete_sentence?(text)
+    return true if text =~ /^第.{1,4}[章回]/
+    text = text.downcase
+    return true if text =~ /^chapter\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
+  end
+  def guess_section?(text)
+    return false if hav_complete_sentence?(text)
+    return true if text =~ /^第.{1,3}[节]/
+  end
+  def guess_preface?(text)
+    return false if hav_complete_sentence?(text)
+    return true if text =~ /^前\s*言$/
+    return true if text =~ /^序\s*言$/
+    return true if text =~ /^序$/
+    return true if text =~ /^序[言]\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
+    text = text.downcase
+    return true if text =~ /^preface$/
+    return true if text =~ /^preface\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
+    return true if text =~ /^foreword$/
+    return true if text =~ /^foreword\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
+  end
+  def guess_index?(text)
+    return false if hav_complete_sentence?(text)
+    return true if text =~ /^索\s*引$/
+    return true if text =~ /^索\s*引\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
+    text = text.downcase
+    return true if text =~ /^index$/
+    return true if text =~ /^index\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
+  end
+  def guess_appendix?(text)
+    return false if hav_complete_sentence?(text)
+    return true if text =~ /^附\s*录$/
+    return true if text =~ /^附\s*录\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIiA-Za-z]/
+    text = text.downcase
+    return true if text =~ /^appendix$/
+    return true if text =~ /^appendix\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIiA-Za-z]/
+  end
+  def guess_glossary?(text)
+    return false if hav_complete_sentence?(text)
+    return true if text =~ /^术\s*语$/
+    return true if text =~ /^术\s*语\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
+    text = text.downcase
+    return true if text =~ /^glossary$/
+    return true if text =~ /^glossary\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
+  end
+  def guess_digital_section?(text)
+    return false if hav_complete_sentence?(text)
+    matcher = text.match(/^(\d+\.)+[\d]\s*(.*)/)
+    if matcher
+      return false if matcher[2].length == 0
+      level = matcher[0].split(".").count - 1
+      "sect#{level}".to_sym
+    end
+  end
+  def guess_digital_head_line?(text)
+    return false if hav_complete_sentence?(text)
+    matcher = text.match(/(^\d+(\.\d)*\s)(.*)/)
+    if matcher
+      return false if matcher[3].length == 0
+      levels = matcher[1].split(".")
+      return false if levels[0].to_i > 99
+      case levels.count
+      when 1
+        "chapter".to_sym
+      else
+        "sect#{levels.count - 1}".to_sym
+      end
+    end
+  end
+  def guess_head_line?(text)
+    return :volume if guess_volume?(text)
+    return :part if guess_part?(text)
+    return :chapter if guess_chapter?(text)
+    return :section if guess_section?(text)
+    return :preface if guess_preface?(text)
+    return :appendix if guess_appendix?(text)
+    return :index if guess_index?(text)
+    return :glossary if guess_glossary?(text)
+  end
+  def build_struct(content)
+    stack = Array.new(8)
+    struct = []
+    content.each do |line|
+      if line.is_a?(Hash)
+        case type = line[:type].to_sym
+        when :volume
+          7.downto(0) do |index|
+            closed_node(struct,stack[0..index])
+            stack[index]=nil
+          end
+          stack[0] = {:title=>line[:title],:type=>type.to_s,:children=>[]}
+        when :part
+          7.downto(1) do |index|
+            closed_node(struct,stack[0..index])
+            stack[index]=nil
+          end
+          stack[1] = {:title=>line[:title],:type=>type.to_s,:children=>[]}
+        when :chapter,:appendix,:index,:glossary,:preface,:afterword
+          7.downto(2) do |index|
+            closed_node(struct,stack[0..index])
+            stack[index]=nil
+          end
+          stack[2] = {:title=>line[:title],:type=>type.to_s,:children=>[]}
+        when :sect1
+          if stack[2] && ['preface','appendix','index','glossary'].include?(stack[2][:type])
+            stack[2][:children] << line[:title]
+          else
+            7.downto(3) do |index|
+              closed_node(struct,stack[0..index])
+              stack[index]=nil
+            end
+            stack[3] =  {:title=>line[:title],:type=>type.to_s,:children=>[]}
+          end
+        when :sect2
+          7.downto(4) do |index|
+            closed_node(struct,stack[0..index])
+            stack[index]=nil
+          end
+          stack[4] =  {:title=>line[:title],:type=>type.to_s,:children=>[]}
+        when :sect3
+          7.downto(5) do |index|
+            closed_node(struct,stack[0..index])
+            stack[index]=nil
+          end
+          stack[5] =  {:title=>line[:title],:type=>type.to_s,:children=>[]}
+        when :sect4
+          7.downto(6) do |index|
+            closed_node(struct,stack[0..index])
+            stack[index]=nil
+          end
+          stack[6] =  {:title=>line[:title],:type=>type.to_s,:children=>[]}
+        when :sect5
+          closed_node(struct,stack)
+          stack[7] = {:title=>line[:title],:type=>type.to_s,:children=>[]}
+        end
+      else
+        if stack[7]
+          stack[7][:children] << line
+        elsif stack[6]
+          stack[6][:children] << line
+        elsif stack[5]
+          stack[5][:children] << line
+        elsif stack[4]
+          stack[4][:children] << line
+        elsif stack[3]
+          stack[3][:children] << line
+        elsif stack[2]
+          stack[2][:children] << line
+        elsif stack[1]
+          stack[1][:children] << line
+        elsif stack[0]
+          stack[0][:children] << line
+        else
+          struct << line
+        end
+      end
+    end
+    7.downto(0) do |index|
+      closed_node(struct,stack[0..index])
+      stack[index] = nil
+    end
+    struct
+  end
+  def closed_node(struct,stack)
+    last = stack.pop
+    if last
+      result = false
+      while stack.any?
+        item = stack.pop
+        if item
+          item[:children] << last
+          result = true
+          break
+        end
+      end
+      if result == false
+        struct << last
+      end
+    end
+  end
+  def hav_complete_sentence?(text)
+    text = text.gsub(/^\d+(\.\d)*\s/,'')
+    text =~ /[\.。!\?！？]/
+  end
+  def extract_toc_from_struct(struct)
+    toc = []
+    struct.each do |item|
+      if item.is_a?(Hash)
+        children = []
+        if item[:children].any?
+          children = extract_toc_from_struct(item[:children])
+        end
+        item_hash = {:title=>item[:title], :type=> item[:type],:children=>children}
+        toc << item_hash
+      end
+    end
+    toc
+  end
+  def gen_docbook_toc(toc)
+    "<toc><title>Table of contents</title>#{gen_docbook_tocdiv(toc)}</toc>"
+  end
+  def gen_docbook_tocdiv(toc)
+    doc_toc = []
+    toc.each do |item|
+      children = ""
+      if item[:children].any?
+        children = gen_docbook_tocdiv(item[:children])
+      end
+      doc_toc << "<tocdiv><title>#{item[:title]}</title>#{children}</tocdiv>"
+    end
+    doc_toc.join("")
+  end
+  def gen_docbook_content(struct)
+    content = []
+    struct.each do |item|
+      if item.is_a?(Hash)
+        children = ""
+        if item[:children].any?
+          children = gen_docbook_content(item[:children])
+        end
+        case item[:type]
+        when 'volume','part'
+          content << "<part label='#{UUID.generate}'>\n<info><title>#{item[:title]}</title>\n</info>#{children}\n</part>"
+        when 'chapter','appendix','glossary','index','preface'
+          content << "<#{item[:type]} id='#{UUID.generate}'>\n<info>\n<title>#{item[:title]}</title>\n</info>#{children}\n</#{item[:type]}>"
+        when 'sect1','sect2','sect3','sect4','sect5'
+          content << "<#{item[:type]} id='#{UUID.generate}'>\n<info>\n<title>#{item[:title]}</title>\n</info>#{children}\n</#{item[:type]}>"
+        end
+      else
+        text = escape_html(clean_text(item))
+        if text.length > 0
+          content << "<para id='#{UUID.generate}'>#{text}</para>"
+        end
+      end
+    end
+    content.join("\n")
+  end
+  def to_utf8(text,encoding='GB2312')
+    doc = Iconv.iconv('UTF-8//IGNORE',"#{encoding}//IGNORE",text)
+    doc.join("")
+    #text.encode(encoding)
+  rescue
+    text
+  end
+  def detect_utf8(content)
+    content.each_line{|line| line.strip}
+    true
+  rescue
+    false
+  end
+  # sanitize_for_epub_text
+  def sanitize_for_epub_text(content)
+    lines = []
+    content.each_line do |line|
+      unless line.downcase.include?('document outline')
+        lines << line
+      else
+        break;
+      end
+    end
+    lines.join("")
+  end
+  # clean_text
+  #  获得干净的文本，去除两边的空格和回车
+  def clean_text(text)
+    return text if text.nil?
+    text = text.strip
+    text.gsub("\n",'')
+  end
+  # escape_html
+  # 文本转义，在txt文本转html时需要使用
+  def escape_html(text)
+    CGI::escapeHTML(text)
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,86 @@
+--- !ruby/object:Gem::Specification
+name: extract_book_struct
+version: !ruby/object:Gem::Version
+  version: 0.0.3
+  prerelease:
+platform: ruby
+authors:
+- Aaron
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-03-29 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: uuid
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: iconv
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: 书结构信息提取工具.
+email: aaron@nonobo.com
+executables:
+- extract_book_struct
+- batch_extract_book_struct
+extensions: []
+extra_rdoc_files: []
+files:
+- README
+- CHANGELOG
+- bin/extract_book_struct
+- bin/batch_extract_book_struct
+- lib/extract_book_struct.rb
+- lib/batch_extract.rb
+homepage:
+licenses: []
+post_install_message:
+rdoc_options:
+- --charset=UTF-8
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements:
+- none
+rubyforge_project:
+rubygems_version: 1.8.25
+signing_key:
+specification_version: 3
+summary: 书结构信息提取工具.
+test_files: []