ebook_tools 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ebook_tools.gemspec +10 -9
 - data/lib/ebook_tools.rb +32 -35
 - data/lib/epub_book.rb +164 -0
 - data/lib/paras_in_mongo.rb +1 -0
 - data/lib/txt_book.rb +38 -18
 - data/lib/utils.rb +15 -0
 - metadata +8 -7
 
    
        data/ebook_tools.gemspec
    CHANGED
    
    | 
         @@ -2,13 +2,13 @@ 
     | 
|
| 
       2 
2 
     | 
    
         | 
| 
       3 
3 
     | 
    
         
             
            Gem::Specification.new do |s|
         
     | 
| 
       4 
4 
     | 
    
         
             
              s.name = %q{ebook_tools}
         
     | 
| 
       5 
     | 
    
         
            -
              s.version = '0.1.4'
     | 
| 
      
 5 
     | 
    
         
            +
              s.version = '0.1.5'
         
     | 
| 
       6 
6 
     | 
    
         | 
| 
       7 
7 
     | 
    
         
             
              s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
         
     | 
| 
       8 
8 
     | 
    
         
             
              s.authors = ["Aaron"]
         
     | 
| 
       9 
     | 
    
         
            -
              s.date = %q{2013-04 
     | 
| 
      
 9 
     | 
    
         
            +
              s.date = %q{2013-06-04}
         
     | 
| 
       10 
10 
     | 
    
         
             
              s.description = %q{电子书工具集.}
         
     | 
| 
       11 
     | 
    
         
            -
              s.email = %q{ 
     | 
| 
      
 11 
     | 
    
         
            +
              s.email = %q{yalong1976@gmail.com}
         
     | 
| 
       12 
12 
     | 
    
         
             
              s.require_paths = ["lib"]
         
     | 
| 
       13 
13 
     | 
    
         
             
              s.requirements = ["none"]
         
     | 
| 
       14 
14 
     | 
    
         
             
              s.summary = %q{电子书工具集.}
         
     | 
| 
         @@ -25,15 +25,16 @@ Gem::Specification.new do |s| 
     | 
|
| 
       25 
25 
     | 
    
         
             
                "bin/doc_book_import_mongo",
         
     | 
| 
       26 
26 
     | 
    
         
             
                "bin/para_import_scheduling",
         
     | 
| 
       27 
27 
     | 
    
         
             
                "bin/xml2json",
         
     | 
| 
      
 28 
     | 
    
         
            +
                "lib/doc_book_in_mongo.rb",    
         
     | 
| 
       28 
29 
     | 
    
         
             
                "lib/ebook_tools.rb",
         
     | 
| 
       29 
     | 
    
         
            -
                "lib/txt_book.rb",
         
     | 
| 
       30 
     | 
    
         
            -
                "lib/header_detect.rb",
         
     | 
| 
       31 
     | 
    
         
            -
                "lib/pdf.rb",
         
     | 
| 
       32 
     | 
    
         
            -
                "lib/txt.rb",
         
     | 
| 
       33 
30 
     | 
    
         
             
                "lib/epub.rb",
         
     | 
| 
      
 31 
     | 
    
         
            +
                "lib/epub_book.rb",
         
     | 
| 
      
 32 
     | 
    
         
            +
                "lib/header_detect.rb",    
         
     | 
| 
      
 33 
     | 
    
         
            +
                "lib/paras_in_mongo.rb",    
         
     | 
| 
      
 34 
     | 
    
         
            +
                "lib/pdf.rb",    
         
     | 
| 
      
 35 
     | 
    
         
            +
                "lib/txt.rb",    
         
     | 
| 
      
 36 
     | 
    
         
            +
                "lib/txt_book.rb",
         
     | 
| 
       34 
37 
     | 
    
         
             
                "lib/utils.rb",
         
     | 
| 
       35 
     | 
    
         
            -
                "lib/paras_in_mongo.rb",
         
     | 
| 
       36 
     | 
    
         
            -
                "lib/doc_book_in_mongo.rb",
         
     | 
| 
       37 
38 
     | 
    
         
             
                "workers/para_import_worker.rb",
         
     | 
| 
       38 
39 
     | 
    
         
             
                "ebook_tools.gemspec"
         
     | 
| 
       39 
40 
     | 
    
         
             
              ]
         
     | 
    
        data/lib/ebook_tools.rb
    CHANGED
    
    | 
         @@ -1,6 +1,6 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            #!/usr/bin/env ruby
         
     | 
| 
       2 
2 
     | 
    
         
             
            # encoding: UTF-8
         
     | 
| 
       3 
     | 
    
         
            -
            ['utils','epub','txt','pdf','header_detect','txt_book'].each do |file|
         
     | 
| 
      
 3 
     | 
    
         
            +
            ['utils','epub','txt','pdf','header_detect','txt_book','epub_book'].each do |file|
         
     | 
| 
       4 
4 
     | 
    
         
             
              require File.join(File.dirname(__FILE__),file)
         
     | 
| 
       5 
5 
     | 
    
         
             
            end
         
     | 
| 
       6 
6 
     | 
    
         | 
| 
         @@ -202,27 +202,34 @@ module EbookTools 
     | 
|
| 
       202 
202 
     | 
    
         | 
| 
       203 
203 
     | 
    
         
             
              # Converts the book at +source+ into a DocBook XML file at +destination+.
              #
              # Files ending in '.epub' are handled by EpubBook; everything else goes
              # through TxtBook, with the text taken from Utils.extract_text_from_file
              # for '.html' and read directly from disk for '.txt'.
              #
              # source      - path of the input file; its extension selects the branch.
              # destination - path of the DocBook file to write.
              # options     - metadata hash passed to the book class. :title defaults to
              #               the source file name without its extension.
              #               NOTE(review): the default is stored in the caller's hash;
              #               confirm no caller reuses one options hash across books.
              #
              # Prints the table of contents (and, for text books, the number of
              # repaired line breaks) to stdout.
              #
              # Returns true when a document was produced and written, nil otherwise.
              def extract_book_struct_to_file(source, destination, options = {})
                extension = File.extname(source)
                options[:title] ||= File.basename(source, extension)

                from_epub = extension == '.epub'
                book =
                  if from_epub
                    EpubBook.new(source, options)
                  else
                    # Any extension other than .html/.txt leaves the text nil, as before.
                    text = case extension
                           when '.html' then Utils.extract_text_from_file(source, '.html')
                           when '.txt'  then File.read(source) # File.read closes the handle
                           end
                    TxtBook.new(text, options)
                  end

                docbook_xml = book.to_doc_book
                return nil unless docbook_xml

                write_doc_book(destination, docbook_xml)
                puts "目录结构:"
                puts book.toc_to_text
                # Only the text pipeline reports repaired line breaks.
                puts "共修复#{book.breaklines_count}个断点." unless from_epub
                true
              end
         
     | 
| 
       227 
234 
     | 
    
         | 
| 
       228 
235 
     | 
    
         
             
              # batch_extract_from_dir
         
     | 
| 
         @@ -262,6 +269,11 @@ module EbookTools 
     | 
|
| 
       262 
269 
     | 
    
         
             
                end
         
     | 
| 
       263 
270 
     | 
    
         
             
              end 
         
     | 
| 
       264 
271 
     | 
    
         | 
| 
      
 272 
     | 
    
         
            +
              # Writes +docbook_xml+ to +destination+ in binary mode, creating any
              # missing parent directories first.
              #
              # destination - path of the file to (over)write.
              # docbook_xml - String with the document contents.
              #
              # Returns the value of File#write (number of bytes written).
              def write_doc_book(destination, docbook_xml)
                # mkdir_p does nothing for directories that already exist, so no
                # existence check is needed (Dir.exists? is gone in Ruby 3.2+).
                FileUtils.mkdir_p(File.dirname(destination))
                File.open(destination, 'wb') { |file| file.write(docbook_xml) }
              end
         
     | 
| 
      
 276 
     | 
    
         
            +
             
     | 
| 
       265 
277 
     | 
    
         
             
              # text_paras_repair
         
     | 
| 
       266 
278 
     | 
    
         
             
              # 对文本文件格式中的中断段落进行修复
         
     | 
| 
       267 
279 
     | 
    
         
             
              def text_paras_repair(source_file,target_file,options={})
         
     | 
| 
         @@ -271,21 +283,6 @@ module EbookTools 
     | 
|
| 
       271 
283 
     | 
    
         
             
                File.open(target_file,'w'){|file| file.write content}
         
     | 
| 
       272 
284 
     | 
    
         
             
              end
         
     | 
| 
       273 
285 
     | 
    
         | 
| 
       274 
     | 
    
         
            -
              # Converts +filename+ to plain text by running the external
              # `ebook-convert` command and returns the converted text.
              #
              # filename - path of the file to convert.
              # format   - extension (e.g. '.epub') stripped from the file name to
              #            derive the temporary "<basename>.txt" output, which is
              #            created in the current working directory.
              #
              # Returns the contents of the converted text file as a String.
              # Raises Errno::ENOENT when the converter is missing or produced no output.
              def extract_text_from_file(filename, format)
                txt_path = "#{File.basename(filename, format)}.txt"
                # Argument-vector form: no shell is involved, so quotes and other
                # metacharacters in the file name reach the converter verbatim.
                IO.popen(['ebook-convert', filename, txt_path]) { |io| io.read }
                File.read(txt_path)
              ensure
                # Remove the temporary file even when conversion or reading failed.
                FileUtils.remove_file(txt_path, true) if txt_path
              end
         
     | 
| 
       288 
     | 
    
         
            -
             
     | 
| 
       289 
286 
     | 
    
         
             
              # sanitize_for_epub_text
         
     | 
| 
       290 
287 
     | 
    
         
             
              def sanitize_for_epub_text(content)
         
     | 
| 
       291 
288 
     | 
    
         
             
                return content if content.blank?
         
     | 
    
        data/lib/epub_book.rb
    ADDED
    
    | 
         @@ -0,0 +1,164 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # encoding: utf-8
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'uuid'
         
     | 
| 
      
 3 
     | 
    
         
            +
            require 'cgi'
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            # epub_book
            #  Builds a DocBook XML document from the text extracted from an EPUB file.
            #  The extracted text is split into a table-of-contents part (+outline+)
            #  and a body part (+content+).
            class EpubBook
              # Line (compared stripped and lower-cased) that starts the outline section.
              OUTLINE_MARKER = 'document outline'.freeze
              # Lines (compared stripped and lower-cased) that must both be present for
              # the text to be treated as "table of contents followed by body".
              TOC_MARKERS = ['目录', 'content'].freeze

              attr_reader :title, :author, :publisher, :pubdate, :isbn, :content, :outline

              # filename - path of the EPUB file; must exist.
              # options  - metadata hash; :title, :author, :publisher, :pubdate and
              #            :isbn are stored as given.
              #
              # Raises RuntimeError when +filename+ does not exist.
              def initialize(filename, options = {})
                raise '无效的文件' unless File.exist?(filename)

                @title = options[:title]
                @author = options[:author]
                @publisher = options[:publisher]
                @pubdate = options[:pubdate]
                @isbn = options[:isbn]

                text = Utils.extract_text_from_file(filename, '.epub')
                text = Utils.to_utf8(text) unless Utils.detect_utf8(text)
                @outline, @content = extract_for_epub_text(preprocess_content(text))
              end

              # Returns the table of contents as plain text, one entry per line.
              def toc_to_text
                outline
              end

              # Returns the whole book as a DocBook 5.0 XML String.
              def to_doc_book
                build_doc_book(@outline, @content,
                               :title => title, :publisher => publisher,
                               :pubdate => pubdate, :author => author, :isbn => isbn)
              end

              private

              # True for nil and for strings made only of whitespace. Avoids depending
              # on ActiveSupport's Object#blank? being loaded.
              def blank_text?(text)
                text.nil? || text !~ /[^[:space:]]/
              end

              # Escapes a metadata value for use as XML character data; nil becomes ''.
              def xml_text(value)
                CGI.escapeHTML(value.to_s)
              end

              # Normalises the raw text: one cleaned, non-empty paragraph per line.
              def preprocess_content(content)
                extract_paras(content).join("\n")
              end

              # Splits +content+ into lines, cleans each with Utils.clean_text and
              # drops the ones that end up empty. Returns [] for blank input.
              def extract_paras(content)
                return [] if blank_text?(content)

                content.each_line.map { |line| Utils.clean_text(line) }.reject(&:empty?)
              end

              # Assembles the DocBook document from the outline and body text.
              # options supplies :title, :author, :pubdate and :publisher; the values
              # are XML-escaped before being embedded.
              def build_doc_book(outline, content, options = {})
                doc_toc = gen_docbook_toc(outline.split("\n"))
                doc_content = gen_docbook_content(content)

                # Everything after the XML declaration keeps the six-space indent the
                # generated files have always had.
                indented = [
                  %Q(<book xmlns="http://docbook.org/ns/docbook" version="5.0" id="#{UUID.generate}">),
                  '<info>',
                  "<title>#{xml_text(options[:title])}</title>",
                  '<authorgroup>',
                  "<author><personname>#{xml_text(options[:author])}</personname></author>",
                  '</authorgroup>',
                  "<pubdate>#{xml_text(options[:pubdate])}</pubdate>",
                  "<publisher><publishername>#{xml_text(options[:publisher])}</publishername></publisher>",
                  '</info>',
                  doc_toc,
                  doc_content,
                  '</book>'
                ].map { |line| "      #{line}" }

                (['<?xml version="1.0" encoding="utf-8"?>'] + indented).join("\n") + "\n"
              end

              # Wraps the table-of-contents entries in a <toc> element.
              def gen_docbook_toc(toc)
                "<toc><title>Table of contents</title>#{gen_docbook_tocdiv(toc)}</toc>"
              end

              # Renders each entry as a <tocdiv>, with the title cleaned and escaped.
              def gen_docbook_tocdiv(toc)
                toc.map do |item|
                  text = Utils.escape_html(Utils.clean_text(item))
                  "<tocdiv><title>#{text}</title></tocdiv>"
                end.join("")
              end

              # Renders the body as a single <sect1> with one <para> per paragraph,
              # each paragraph getting a freshly generated id.
              def gen_docbook_content(content)
                paras_content = extract_paras(content).map do |para|
                  text = Utils.escape_html(Utils.clean_text(para))
                  "<para id='#{UUID.generate}'>#{text}</para>"
                end.join("\n")
                "<sect1>#{paras_content}</sect1>"
              end

              # extract_for_epub_text
              # Splits the text into [outline, content]. The outline is '' when the
              # text is blank or carries neither an outline nor a contents section.
              def extract_for_epub_text(content)
                return ['', content.to_s] if blank_text?(content)

                if outline_type?(content)
                  extract_outline_with_content(content)
                elsif toc_type?(content)
                  extract_toc_with_content(content)
                else
                  ['', content]
                end
              end

              # Lines before the outline marker form the body, lines after it form the
              # outline; marker lines themselves are dropped.
              def extract_outline_with_content(content)
                in_outline = false
                body = []
                outline = []
                content.each_line do |line|
                  if line.strip.downcase == OUTLINE_MARKER
                    in_outline = true
                    next
                  end
                  (in_outline ? outline : body) << line
                end
                [outline.join, body.join]
              end

              # Paragraphs following the 'content' marker are collected as contents
              # entries until one of them shows up a second time; the paragraphs after
              # that repeated one are the body. Falls back to ['', content] when the
              # marker or the repeated entry cannot be found.
              def extract_toc_with_content(content)
                paras = extract_paras(content)
                # Case-insensitive, to agree with the detection done in toc_type?.
                marker_at = paras.index { |para| para.downcase == 'content' }
                return ['', content] unless marker_at

                candidates = paras[(marker_at + 1)..-1]
                toc = []
                body_start = nil
                candidates.each_with_index do |para, position|
                  if toc.include?(para)
                    body_start = position + 1
                    break
                  end
                  toc << para
                end
                return ['', content] unless body_start

                [toc.join("\n"), candidates[body_start..-1].join("\n")]
              end

              # True when some line is the outline marker.
              def outline_type?(content)
                content.each_line.any? { |line| line.strip.downcase == OUTLINE_MARKER }
              end

              # True when every contents marker appears as a line of its own.
              def toc_type?(content)
                normalized = content.each_line.map { |line| line.strip.downcase }
                TOC_MARKERS.all? { |marker| normalized.include?(marker) }
              end
            end
         
     | 
    
        data/lib/paras_in_mongo.rb
    CHANGED
    
    
    
        data/lib/txt_book.rb
    CHANGED
    
    | 
         @@ -87,18 +87,22 @@ class TxtBook 
     | 
|
| 
       87 
87 
     | 
    
         
             
              end
         
     | 
| 
       88 
88 
     | 
    
         | 
| 
       89 
89 
     | 
    
         
             
              # Table of contents derived from struct_content, memoized in @toc.
              #
              # Returns nil whenever struct_content is falsy, even if a table of
              # contents was memoized by an earlier call.
              def toc
                return unless struct_content

                @toc ||= extract_toc_from_struct(struct_content)
              end
         
     | 
| 
       92 
92 
     | 
    
         | 
| 
       93 
93 
     | 
    
         
             
              # Renders the table of contents through gen_toc, formatting each entry
              # as its title, a newline, then whatever gen_toc passes as children.
              #
              # Returns nil when there is no table of contents.
              def toc_to_text
                return unless toc

                gen_toc(toc) { |item, children| "#{item[:title]}\n#{children}" }
              end
         
     | 
| 
       98 
100 
     | 
    
         | 
| 
       99 
101 
     | 
    
         
             
              # Builds the DocBook document for this book via build_doc_book.
              #
              # Uses struct_content when it is truthy and falls back to content
              # otherwise; the book metadata is passed as the options hash.
              def to_doc_book
                body = struct_content ? struct_content : content
                meta = {:title=>title,:publisher=>publisher,:pubdate=>pubdate,:author=>author,:isbn=>isbn}
                build_doc_book(body,meta)
              end
         
     | 
| 
       104 
108 
     | 
    
         | 
| 
         @@ -184,12 +188,15 @@ class TxtBook 
     | 
|
| 
       184 
188 
     | 
    
         
             
                marked_content
         
     | 
| 
       185 
189 
     | 
    
         
             
              end
         
     | 
| 
       186 
190 
     | 
    
         | 
| 
       187 
     | 
    
         
            -
              def build_doc_book( 
     | 
| 
       188 
     | 
    
         
            -
                doc_toc = gen_docbook_toc(toc)
         
     | 
| 
      
 191 
     | 
    
         
            +
              def build_doc_book(content,options={})
         
     | 
| 
      
 192 
     | 
    
         
            +
                doc_toc = gen_docbook_toc(toc) if toc
         
     | 
| 
       189 
193 
     | 
    
         | 
| 
       190 
     | 
    
         
            -
                 
     | 
| 
       191 
     | 
    
         
            -
             
     | 
| 
       192 
     | 
    
         
            -
             
     | 
| 
      
 194 
     | 
    
         
            +
                doc_content =  if content.is_a?(Array)
         
     | 
| 
      
 195 
     | 
    
         
            +
                  struct = content.map{|item| item if item.is_a?(Hash)}.compact
         
     | 
| 
      
 196 
     | 
    
         
            +
                  gen_docbook_content_with_struct(struct)
         
     | 
| 
      
 197 
     | 
    
         
            +
                else
         
     | 
| 
      
 198 
     | 
    
         
            +
                  gen_docbook_content(content)
         
     | 
| 
      
 199 
     | 
    
         
            +
                end
         
     | 
| 
       193 
200 
     | 
    
         | 
| 
       194 
201 
     | 
    
         
             
            <<-EOS
         
     | 
| 
       195 
202 
     | 
    
         
             
            <?xml version="1.0" encoding="utf-8"?>
         
     | 
| 
         @@ -208,6 +215,8 @@ class TxtBook 
     | 
|
| 
       208 
215 
     | 
    
         
             
            EOS
         
     | 
| 
       209 
216 
     | 
    
         
             
              end
         
     | 
| 
       210 
217 
     | 
    
         | 
| 
      
 218 
     | 
    
         
            +
             
     | 
| 
      
 219 
     | 
    
         
            +
             
     | 
| 
       211 
220 
     | 
    
         
             
              def build_struct(content)
         
     | 
| 
       212 
221 
     | 
    
         
             
                stack = Array.new(8)
         
     | 
| 
       213 
222 
     | 
    
         
             
                struct = []
         
     | 
| 
         @@ -314,18 +323,20 @@ EOS 
     | 
|
| 
       314 
323 
     | 
    
         
             
              end
         
     | 
| 
       315 
324 
     | 
    
         | 
| 
       316 
325 
     | 
    
         
             
              # Builds a nested table of contents from +struct+.
              #
              # Only Hash entries of +struct+ contribute; anything else is skipped.
              # Each Hash becomes {:title, :type, :children}, where :children is
              # the table of contents extracted recursively from the entry's own
              # :children.
              #
              # A Hash entry without :children is treated as a leaf
              # (:children => []) rather than raising NoMethodError.
              #
              # Returns an Array of Hashes, or nil when +struct+ is nil/false.
              def extract_toc_from_struct(struct)
                return unless struct

                struct.select { |item| item.is_a?(Hash) }.map do |item|
                  # The recursive call returns nil for a missing :children; fall
                  # back to an empty list so every entry has an Array there.
                  nested = extract_toc_from_struct(item[:children]) || []
                  {:title=>item[:title], :type=> item[:type],:children=>nested}
                end
              end
         
     | 
| 
       330 
341 
     | 
    
         | 
| 
       331 
342 
     | 
    
         
             
              def gen_docbook_toc(toc)
         
     | 
| 
         @@ -350,7 +361,16 @@ EOS 
     | 
|
| 
       350 
361 
     | 
    
         
             
                end
         
     | 
| 
       351 
362 
     | 
    
         
             
              end
         
     | 
| 
       352 
363 
     | 
    
         | 
| 
       353 
     | 
    
         
            -
              def gen_docbook_content( 
     | 
| 
      
 364 
     | 
    
         
            +
              # Renders unstructured +content+ as a single DocBook <sect1>.
              #
              # The paragraphs returned by extract_paras are each cleaned with
              # Utils.clean_text, escaped with Utils.escape_html and wrapped in a
              # <para> element whose id comes from UUID.generate; the paragraphs
              # are joined with newlines inside the section.
              def gen_docbook_content(content)
                rendered = extract_paras(content).map { |para|
                  escaped = Utils.escape_html(Utils.clean_text(para))
                  "<para id='#{UUID.generate}'>#{escaped}</para>"
                }
                "<sect1>#{rendered.join("\n")}</sect1>"
              end
         
     | 
| 
      
 372 
     | 
    
         
            +
             
     | 
| 
      
 373 
     | 
    
         
            +
              def gen_docbook_content_with_struct(struct)
         
     | 
| 
       354 
374 
     | 
    
         
             
                content = []
         
     | 
| 
       355 
375 
     | 
    
         
             
                struct.each do |item|
         
     | 
| 
       356 
376 
     | 
    
         
             
                  if item.is_a?(Hash)
         
     | 
    
        data/lib/utils.rb
    CHANGED
    
    | 
         @@ -256,4 +256,19 @@ module Utils 
     | 
|
| 
       256 
256 
     | 
    
         
             
                sections
         
     | 
| 
       257 
257 
     | 
    
         
             
              end
         
     | 
| 
       258 
258 
     | 
    
         | 
| 
      
 259 
     | 
    
         
            +
              # Converts +filename+ to plain text by running the external
              # `ebook-convert` command and returns the converted text.
              #
              # filename - path of the e-book to convert.
              # format   - suffix stripped from the basename (passed to
              #            File.basename), e.g. ".epub".
              #
              # The text is written to "<basename>.txt" in the current working
              # directory, read back, and that file is removed afterwards, even
              # when reading it fails.
              #
              # Raises Errno::ENOENT when `ebook-convert` cannot be found or when
              # the conversion did not produce the expected .txt file.
              def extract_text_from_file(filename,format)
                txt_path = "#{File.basename(filename,format)}.txt"
                # Array form spawns the program directly, without a shell, so
                # quotes, spaces or shell metacharacters in the filename are passed
                # through verbatim instead of being interpreted.
                IO.popen(['ebook-convert', filename, txt_path]) { |io| io.read }
                File.read(txt_path)
              ensure
                FileUtils.remove_file(txt_path,true) if txt_path
              end
         
     | 
| 
      
 273 
     | 
    
         
            +
             
     | 
| 
       259 
274 
     | 
    
         
             
            end
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: ebook_tools
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0.1. 
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.1.5
         
     | 
| 
       5 
5 
     | 
    
         
             
              prerelease: 
         
     | 
| 
       6 
6 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       7 
7 
     | 
    
         
             
            authors:
         
     | 
| 
         @@ -9,7 +9,7 @@ authors: 
     | 
|
| 
       9 
9 
     | 
    
         
             
            autorequire: 
         
     | 
| 
       10 
10 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       11 
11 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       12 
     | 
    
         
            -
            date: 2013-04 
     | 
| 
      
 12 
     | 
    
         
            +
            date: 2013-06-04 00:00:00.000000000 Z
         
     | 
| 
       13 
13 
     | 
    
         
             
            dependencies:
         
     | 
| 
       14 
14 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       15 
15 
     | 
    
         
             
              name: uuid
         
     | 
| 
         @@ -156,7 +156,7 @@ dependencies: 
     | 
|
| 
       156 
156 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       157 
157 
     | 
    
         
             
                    version: '0'
         
     | 
| 
       158 
158 
     | 
    
         
             
            description: 电子书工具集.
         
     | 
| 
       159 
     | 
    
         
            -
            email:  
     | 
| 
      
 159 
     | 
    
         
            +
            email: yalong1976@gmail.com
         
     | 
| 
       160 
160 
     | 
    
         
             
            executables:
         
     | 
| 
       161 
161 
     | 
    
         
             
            - ebook_tools
         
     | 
| 
       162 
162 
     | 
    
         
             
            - para_import_mongo
         
     | 
| 
         @@ -171,15 +171,16 @@ files: 
     | 
|
| 
       171 
171 
     | 
    
         
             
            - bin/doc_book_import_mongo
         
     | 
| 
       172 
172 
     | 
    
         
             
            - bin/para_import_scheduling
         
     | 
| 
       173 
173 
     | 
    
         
             
            - bin/xml2json
         
     | 
| 
      
 174 
     | 
    
         
            +
            - lib/doc_book_in_mongo.rb
         
     | 
| 
       174 
175 
     | 
    
         
             
            - lib/ebook_tools.rb
         
     | 
| 
       175 
     | 
    
         
            -
            - lib/ 
     | 
| 
      
 176 
     | 
    
         
            +
            - lib/epub.rb
         
     | 
| 
      
 177 
     | 
    
         
            +
            - lib/epub_book.rb
         
     | 
| 
       176 
178 
     | 
    
         
             
            - lib/header_detect.rb
         
     | 
| 
      
 179 
     | 
    
         
            +
            - lib/paras_in_mongo.rb
         
     | 
| 
       177 
180 
     | 
    
         
             
            - lib/pdf.rb
         
     | 
| 
       178 
181 
     | 
    
         
             
            - lib/txt.rb
         
     | 
| 
       179 
     | 
    
         
            -
            - lib/ 
     | 
| 
      
 182 
     | 
    
         
            +
            - lib/txt_book.rb
         
     | 
| 
       180 
183 
     | 
    
         
             
            - lib/utils.rb
         
     | 
| 
       181 
     | 
    
         
            -
            - lib/paras_in_mongo.rb
         
     | 
| 
       182 
     | 
    
         
            -
            - lib/doc_book_in_mongo.rb
         
     | 
| 
       183 
184 
     | 
    
         
             
            - workers/para_import_worker.rb
         
     | 
| 
       184 
185 
     | 
    
         
             
            - ebook_tools.gemspec
         
     | 
| 
       185 
186 
     | 
    
         
             
            homepage: 
         
     |