RubyGems - ebook_tools - Versions diffs - 0.0.1 - Mend

ebook_tools 0.0.1

Files changed (12) hide show

data/CHANGELOG ADDED

	@@ -0,0 +1,2 @@
1	+ 0.0.1 2013.4.1
2	+ init release

data/README ADDED

@@ -0,0 +1,76 @@
+# encoding: UTF-8
+# = ExtractBookStruct
+# ExtractBookStruct的目的是从各类电子书内容中提取书的结构信息。目前支持txt,epub,html。
+# ExtractBookStruct选择从TXT文档提取书的结构信息。对TXT的文档要有如下要求:
+#   1. 文档的编码格式必须是UTF-8或GB2312，推荐使用UTF-8格式
+#   2. 文档的内容只包含书内容部分（书名、作者、目录等信息应该不包含在文档内）
+#   3. 文档的段落应该完整（有些PDF转换过来的文档会破坏句子，需要进行预处理）
+#   4. 文档必须符合正常的文档流（错位的章节段落等情况将影响正常的结构提取）
+#   5. 文档需要包含结构信息（例如： 卷、篇、部分、章（回）节或者有连续的序号）
+#   6. 每个结构信息都应该独立成行。
+#
+# 文档结构信息分析
+#   一本书在编排的时候会有自己的结构信息，这些结构信息通常通过卷、篇、部分、章(回)节等表述，也会使用序号的方式表述。总体上可以分为以下几种：
+#  1. 文本描述(text)： 按卷、部分(篇)、章（回）、节等文字表述
+#  2. 数字描述(digital)： 所有结构信息都是按照数字序号表示，比如 1 xxxxx; 1.1 xxxxx
+#  3. 混合描述(hybrid)：章按照文字表述，节按照序号表示，比如 1.1 xxxxxx
+#   根据不同的类型，对结构信息的提取采用不同的处理手段。
+#
+# 有效的标题信息应该符合以下规则:
+#  1. 标题应该不包含完整的句子（应该不包含句子分隔符，例如"。"、"!"等）
+#  2. 应该包含结构信息表述，具体如下：
+#    文本描述:
+#     卷:  以"第xxx卷"开始
+#          以"卷"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#          以"volume"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#     部分（篇）: 以"第xxx部"或"第xxx篇"开始
+#               以"part"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#     章(回）: 以"第xxx章"或"第xxx回"开始
+#              以"chapter"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#     节:  以"第xxx节"开始
+#     前言: 以"前"开始，以"言"结束，中间加入空白字符。例如"前言"，"前  言"等。
+#           以"序"开始，以"言"结束，中间加入空白字符。例如"序言"，"序  言"等。
+#           单个"序"
+#           以"序"或"序言"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#          "preface"
+#          "foreword"
+#           以"preface"或"foreword"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#     索引: 以"索"开始，以"引"结束，中间加入空白字符。例如"索引"，"索  引"等。
+#           以"索引"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#          "index"
+#           以"index"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#     附录: 以"附"开始，以"录"结束，中间加入空白字符。例如"附录"，"附  录"等。
+#           以"附录"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#          "appendix"
+#           以"appendix"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#     术语: 以"术"开始，以"语"结束，中间加入空白字符。例如"术语"，"术  语"等。
+#           以"术语"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#          "glossary"
+#           以"glossary"开始，后面跟序号表述方式，例如 “I”，“Ⅱ”，“1”等
+#
+#    数字描述:
+#      以数字序号层级表达，数字序号和标题内容之间有空白字符分隔。例如"1 管理的概念", "1.1 定义", "1.1.1 管理"等。
+#
+# ==API接口
+#
+# === ExtractBookStruct.from_txt
+#  从文本文件中提取目录结构，使用示例:
+#     ExtractBookStruct.from_txt('1.txt',{:title=>'title',:author=>'author'})
+#
+# === ExtractBookStruct.from_epub
+#  从EPUB文件中提取目录结构，使用示例:
+#      ExtractBookStruct.from_epub('1.epub',{:title=>'title',:author=>'author'})
+#
+# === ExtractBookStruct.from_html
+#  从HTML中提取目录结构,使用示例:
+#      ExtractBookStruct.from_html('1.html',{:title=>'title',:author=>'author'})
+#
+# == 命令行工具
+#   ebook_tools，使用示例:
+#      ebook_tools extract 1.txt 1.xml
+#
+# == 依赖
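+#   ExtractBookStruct依赖以下工具和包:
+#     ebook-convert: calibre cli tools.
+#     uuid: ruby gem.
+#     iconv: ruby gem.
+#     gepub: ruby gem.
+#     poppler: ruby gem.
+#     pdf-reader: ruby gem.
+#     nokogiri: ruby gem.
+#     levenshtein: ruby gem.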

data/bin/ebook_tools ADDED

@@ -0,0 +1,196 @@
+#!/usr/bin/env ruby
+# encoding: UTF-8
+require 'rubygems'
+require 'optparse'
+require File.join(File.expand_path('../../',__FILE__),'lib','ebook_tools')
+def help(command=nil)
+  case command
+  when :convert
+    puts <<-EOF
+    usage:
+      ebook_tools convert [options] source destination
+    source: 源文件
+    destination: 输出的文件
+    options:
+      -K <keywords>, --keywords <keywords> : 当需要给epub打关键词时使用该参数。
+      -F, --fix   : 当需要自动修复异常中断的句子时使用该参数
+      --length : 每行的长度，当需要自动修复异常中断的句子时使用该参数。
+      -H <row_count>, --header <row_count> : 仅对pdf文件有效，当需要指定页眉行数时使用该参数。
+      --footer <row_count> : 仅对pdf文件有效，当需要指定页脚行数时使用该参数。
+    EOF
+  when :batch_convert
+    puts <<-EOF
+    usage:
+      ebook_tools batch_convert [options] source destination
+    source: 源文件所在目录
+    destination: 输出的目标目录
+    options:
+      -K <keywords>, --keywords <keywords> : 当需要给epub打关键词时使用该参数。
+    -F, --fix   : 当需要自动修复异常中断的句子时使用该参数
+    -H <row_count>, --header <row_count> : 仅对pdf文件有效，当需要指定页眉行数时使用该参数。
+    --footer <row_count> : 仅对pdf文件有效，当需要指定页脚行数时使用该参数。
+    EOF
+  when :extract
+    puts <<-EOF
+    usage:
+      ebook_tools extract [options] source destination
+    source： 指定需要提取结构信息的书文件
+    destination: 指定提取的书结构信息所输出的文件
+    options:
+      -T <title>, --title <title> : 书的标题
+    -A <author>, --author <author> : 书作者
+    --pubdate <pubdate> : 出版时间
+    --publisher <publisher> : 出版社
+    EOF
+  when :batch_extract
+    puts <<-EOF
+    usage:
+      ebook_tools batch_extract source destination
+    source: 源文件所在目录
+    destination: 输出的目标目录
+    EOF
+  when :paras_repair
+    puts <<-EOF
+    usage:
+      ebook_tools paras_repair [options] source destination
+    source: 指定需要修复段落的源文件，必须是文本文件
+    destination: 指定修复后输出的文件
+    options:
+      -l <length>, --length <length> : 指定异常段落被截断的最小长度。
+    EOF
+  else
+    puts <<-EOF
+    ebook_tools: ebook处理工具集，包括格式转换，结构提取等。
+    usage:
+      ebook_tools command [options] source destination
+    command:
+      convert:  从source文件格式转换成epub格式，source文件支持txt,html,epub，pdf格式
+      extract: 从source文件中提取书结构信息
+      paras_repair: 对文本文件进行段落修复
+      batch_convert: 批量转换指定目录中的文件为epub格式文件，并存放到目标目录
+      batch_extract: 批量提取指定目录中文件的书结构信息，并生成Docbook存放到目标目录
+    适用对象要求:
+      编码格式为utf-8
+    具体命令的更多信息请通过'ebook_tools help <command>'查看。
+    EOF
+  end
+  exit
+end
+def extract_argv(argv)
+  argv = argv.dup
+  command = argv.shift
+  source = argv[-2]
+  destination = argv[-1]
+  [command,source,destination,argv]
+end
+command,source,destination,opt_args = extract_argv(ARGV)
+options = {}
+opts = OptionParser.new do |opts|
+  opts.on('-F','--fix') do |fix|
+    options[:fix] = fix
+  end
+  opts.on('-H row_count','--header row_count') do |row_count|
+    options[:header_rows_count] = row_count.to_i
+  end
+  opts.on('--footer row_count') do |row_count|
+    options[:footer_rows_count] = row_count.to_i
+  end
+  opts.on('-K keywords','--keywords keywords') do |keywords|
+    options[:keywords] = keywords
+  end
+  opts.on('-L length','--length length') do |length|
+    options[:length] = length.to_i
+  end
+  opts.on('-T title','--title title','title') do |title|
+    options[:title] = title
+  end
+  opts.on('-A author','--author author','author') do |author|
+    options[:author] = author
+  end
+  opts.on('--publisher publisher','publisher') do |publisher|
+    options[:publisher] = publisher
+  end
+  opts.on('--pubdate pubdate','pubdate') do |pubdate|
+    options[:pubdate] = pubdate
+  end
+  opts.on('-h','--help') do
+    help
+  end
+end
+opts.parse opt_args
+command = command.to_sym if command
+if source.nil? || destination.nil?
+  help(command)
+end
+unless Utils.source_exists?(source)
+  puts "error: source #{source} no found"
+  exit
+end
+begin
+  Utils.make_destination_dir(destination)
+rescue
+  puts "error: destination #{destination} not created"
+  exit
+end
+begin
+  case command
+  when :convert
+    if EbookTools.convert(source,destination,options)
+      puts "success: #{source} conversion successfully!"
+    else
+      puts "error: 只允许转换txt，html,pdf,epub格式"
+    end
+  when :batch_convert
+    EbookTools.batch_convert(source,destination,options)
+  when :extract
+    if EbookTools.allow_extract_struct?(source)
+      if EbookTools.extract_book_struct_to_file(source,destination,options)
+        puts "success: extract book struct  successfully!"
+      else
+        puts "警告: 没有检测到书结构信息."
+      end
+    else
+      puts "error: #{source}不是允许的文件格式: txt,html,epub"
+    end
+  when :batch_extract
+    EbookTools.batch_extract_from_dir(source,destination,options)
+  when :paras_repair
+    EbookTools.text_paras_repair(source,destination,options)
+    puts "success: #{source} repair successfully!"
+  else
+    help
+  end
+rescue => e
+  puts "error: #{source} \n#{e.backtrace.join("\n")}"
+end

data/ebook_tools.gemspec ADDED

@@ -0,0 +1,38 @@
+# -*- encoding: utf-8 -*-
+# Gem specification for ebook_tools 0.0.1: declares metadata, the packaged
+# files, the bin/ebook_tools executable and the runtime dependencies.
+Gem::Specification.new do |s|
+  s.name = %q{ebook_tools}
+  s.version = '0.0.1'
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.authors = ["Aaron"]
+  s.date = %q{2013-04-01}
+  s.description = %q{电子书工具集.}
+  s.email = %q{aaron@nonobo.com}
+  s.require_paths = ["lib"]
+  s.requirements = ["none"]
+  s.summary = %q{电子书工具集.}
+  # NOTE(review): has_rdoc= is reported as deprecated by newer RubyGems
+  # releases — confirm against the targeted RubyGems version.
+  s.has_rdoc = true
+  s.rdoc_options = ["--charset=UTF-8"]
+  s.executables << "ebook_tools"
+  # Hand-maintained manifest: a new file under lib/ must be added here to be packaged.
+  s.files = [
+    "README",
+    "CHANGELOG",
+    "bin/ebook_tools",
+    "lib/ebook_tools.rb",
+    "lib/extract_book_struct.rb",
+    "lib/header_detect.rb",
+    "lib/pdf.rb",
+    "lib/txt.rb",
+    "lib/epub.rb",
+    "lib/utils.rb",
+    "ebook_tools.gemspec"
+  ]
+  # Runtime dependencies; none of them carries a version constraint.
+  s.add_dependency(%q<uuid>)
+  s.add_dependency(%q<iconv>)
+  s.add_dependency(%q<gepub>)
+  s.add_dependency(%q<poppler>)
+  s.add_dependency(%q<pdf-reader>)
+  s.add_dependency(%q<nokogiri>)
+  s.add_dependency(%q<levenshtein>)
+end

data/lib/ebook_tools.rb ADDED

@@ -0,0 +1,248 @@
+#!/usr/bin/env ruby
+# encoding: UTF-8
+['utils','epub','txt','pdf','header_detect','extract_book_struct'].each do |file|
+  require File.join(File.dirname(__FILE__),file)
+end
+module EbookTools
+  extend self
+  def convert(filename,epub_file,options={})
+    method_name = "#{File.extname(filename).gsub('.','')}2epub"
+    if EbookTools.respond_to?(method_name)
+      EbookTools.send(method_name,filename,epub_file,options)
+      return true
+    else
+      return nil
+    end
+  end
+  # txt2epub
+  # 将文本格式转换成EPUB格式
+  def txt2epub(filename,epub_file,options={})
+    basename = File.basename(filename,'.txt')
+    temp_dir = "#{basename}"
+    FileUtils.mkdir(temp_dir) unless File.exists?(temp_dir)
+    title,outlines, content = TXT.extract_book_part(filename)
+    if options[:fix]
+      content = Utils.fixed_page_break(content)
+    end
+    html_content = TXT.gen_html_from_txt_book(title,outlines,content)
+    html = Utils.wrapper_html(html_content)
+    html_file = File.join([temp_dir,"#{basename}.html"].compact)
+    Utils.write_file(html,html_file)
+    sections = Utils.detect_sections_from_html(html_file)
+    nav_file = EPUB.gen_nav_file(html_file,sections)
+    EPUB.write_epub(epub_file,options.merge(:files=>[nav_file,html_file]))
+    ensure
+      FileUtils.remove_dir(temp_dir,true)
+  end
+  # html2epub
+  # 将HTML格式转换成EPUB格式
+  def html2epub(filename,epub_file,options={})
+    basename = File.basename(filename,'.html')
+    temp_dir = "#{basename}"
+    FileUtils.mkdir(temp_dir) unless File.exists?(temp_dir)
+    html = File.open(filename).read
+    html_file = File.join([temp_dir,"#{basename}.html"].compact)
+    Utils.write_file(html,html_file)
+    sections = Utils.detect_sections_from_html(html_file)
+    nav_file = EPUB.gen_nav_file(html_file,sections)
+    EPUB.write_epub(epub_file,options.merge(:files=>[nav_file,html_file]))
+    ensure
+      FileUtils.remove_dir(temp_dir,true)
+  end
+  # pdf2epub
+  # 将PDF格式转换成EPUB格式
+  def pdf2epub(filename,epub_file,options={})
+    basename = File.basename(filename,'.pdf')
+    temp_dir = "#{basename}"
+    FileUtils.mkdir(temp_dir) unless File.exists?(temp_dir)
+    pages_text = PDF.extract_pdf_pages_text(filename)
+    pages_text = PDF.sanitize_page_header_and_footer(pages_text,options)
+    pages_text = PDF.fixed_break_with_pages_text(pages_text)
+    sections = PDF.extract_sections(filename)
+    illustrations = PDF.extract_illustrations(filename,{:dir=>temp_dir})
+    html_content = PDF.gen_html_from_sections_and_page_texts(sections,pages_text,illustrations)
+    html = Utils.wrapper_html(html_content)
+    html_file = File.join([temp_dir,"#{basename}.html"].compact)
+    Utils.write_file(html,html_file)
+    illustrations_path = illustrations.map{|image_path| File.join(temp_dir,image_path)}
+    nav_file = EPUB.gen_nav_file(html_file,sections)
+    files = [html_file,nav_file,illustrations_path].flatten
+    meta = PDF.extract_pdf_meta(filename)
+    epub_options = options.merge(meta).merge(:files=>files)
+    EPUB.write_epub(epub_file,epub_options)
+    ensure
+      FileUtils.remove_dir(temp_dir,true)
+  end
+  def batch_convert(source,destination,options={})
+    log = File.open('batch.log','a')
+    success_log = File.open('success.log','a')
+    error_log = File.open('error.log','a')
+    scan_log = File.open('scan.log','a')
+    unknown_log = File.open('unknown.log','a')
+    source_path = File.absolute_path(source)
+    dest_path = File.join(File.absolute_path(destination),'epub')
+    scan_path = File.join(File.absolute_path(destination),'scan')
+    unknown_path = File.join(File.absolute_path(destination),'unknown')
+    backup_path = File.join(File.absolute_path(destination),'backup')
+    format = options[:format]
+    files = Utils.scan_file_from_dir(source_path,:format=>format)
+    total_count = files.count
+    scan_count = 0
+    success_count = 0
+    error_count = 0
+    unknown_count = 0
+    puts "count: #{total_count} file "
+    log.puts "****batch convert****** : #{Time.now}"
+    log.puts "#{source_path}  =>  #{dest_path} "
+    log.puts "count: #{total_count} file "
+    success_log.puts "****batch convert****** : #{Time.now}"
+    success_log.puts "#{source_path}  =>  #{dest_path} "
+    error_log.puts "****batch convert****** : #{Time.now}"
+    error_log.puts "#{source_path}  =>  #{dest_path} "
+    scan_log.puts "****batch convert****** : #{Time.now}"
+    scan_log.puts "#{source_path}  =>  #{dest_path} "
+    unknown_log.puts "****batch convert****** : #{Time.now}"
+    unknown_log.puts "#{source_path}  =>  #{dest_path} "
+    files.each do |file|
+      dest_file = File.join(File.dirname(File.join(dest_path,file.gsub(source_path,''))),"#{File.basename(file,File.extname(file))}.epub")
+      keywords = Utils.extract_keywords_from_path(File.dirname(file).gsub(source_path,''))
+      puts "start convert #{file}"
+      method_name = "#{File.extname(file).gsub('.','')}2epub"
+      if EbookTools.respond_to?(method_name)
+        begin
+          if PDF.scan_pdf?(file)
+            scan_file = File.join(scan_path,file.gsub(source_path,''))
+            FileUtils.mkdir_p(File.dirname(scan_file)) unless Dir.exists?(File.dirname(scan_file))
+            FileUtils.mv(file,scan_file,:force=>true)
+            scan_count += 1
+            scan_log.puts "warning: #{file} is scan pdf."
+          else
+            EbookTools.send(method_name,file,dest_file,{:keywords=>keywords})
+            success_file = File.join(backup_path,file.gsub(source_path,''))
+            FileUtils.mkdir_p(File.dirname(success_file)) unless Dir.exists?(File.dirname(success_file))
+            FileUtils.mv(file,success_file,:force=>true)
+            success_count += 1
+            success_log.puts "success: #{source} conversion successfully!"
+          end
+        rescue Exception => e
+          unknown_file = File.join(unknown_path,file.gsub(source_path,''))
+          FileUtils.mkdir_p(File.dirname(unknown_file)) unless Dir.exists?(File.dirname(unknown_file))
+          FileUtils.mv(file,unknown_file,:force=>true)
+          error_count += 1
+          error_log.puts "error: #{source} \n#{e.backtrace.join("\n")}"
+        end
+      end
+    end
+    success_log.puts "count: #{success_count}   Time: #{Time.now} \n"
+    scan_log.puts "count: #{scan_count}    Time: #{Time.now} \n"
+    error_log.puts "count: #{error_count}    Time: #{Time.now} \n"
+    unknown_log.puts "unknown: #{unknown_count}    Time: #{Time.now} \n"
+    log.puts "success: #{success_count}   scan: #{scan_count}   error: #{error_count}    Time: #{Time.now} \n"
+    ensure
+      success_log.close
+      error_log.close
+      scan_log.close
+      unknown_log.close
+      log.close
+  end
+  def allow_extract_struct?(file)
+    extname = File.extname(file)
+    ['.txt','.html','.epub'].include?(extname.downcase)
+  end
+  def extract_book_struct_to_file(source,destination,options={})
+    method_name = "from_#{File.extname(source).gsub('.','')}"
+    if ExtractBookStruct.respond_to?(method_name)
+      docbook_xml = ExtractBookStruct.send(method_name,source,options)
+      if docbook_xml
+        File.open(destination,'wb'){|file|file.write docbook_xml}
+        return true
+      else
+        return nil
+      end
+    end
+  end
+  # batch_extract_from_dir
+  #  batch extract book struct form dir
+  # parameters:
+  #   +source+     source directory
+  #   +destination+   output directory
+  #   +options+        optional parameter.
+  #      :format     指定需要提取结构的文件后缀名，例如要从所有txt文件中提取，通过:format=>'.txt'指定
+  def batch_extract_from_dir(source,destination,options={})
+    format = options.delete(:format)
+    files = Utils.scan_file_from_dir(source,{:format=>format})
+    files.each do |file|
+      extname = File.extname(file)
+      basename = File.basename(file,extname)
+      dest_file = File.join(File.dirname(File.join(destination,file.gsub(source,''))),"#{basename}.xml")
+      if allow_extract_struct?(file)
+        puts "start extract #{file} ..."
+        begin
+          if extract_book_struct_to_file(file,dest_file)
+            puts "success: extract book struct  successfully!"
+          else
+            puts "警告: 没有检测到书结构信息."
+          end
+        rescue Exception => e
+          puts "error: #{file} \n#{e.backtrace.join("\n")}"
+        end
+      else
+        puts "error: #{file}不是允许的文件格式: txt,html,epub"
+      end
+    end
+  end
+  # text_paras_repair
+  # 对文本文件格式中的中断段落进行修复
+  def text_paras_repair(source_file,target_file,options={})
+    content = File.open(source_file).read
+    content = Utils.to_utf8 unless Utils.detect_utf8(content)
+    content = Utils.fixed_page_break(content,options)
+    File.open(target_file,'w'){|file| file.write content}
+  end
+end