RubyGems - EPUBChop - Versions diffs - 0.0.1 → 0.0.6 - Mend

EPUBChop 0.0.1 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

checksums.yaml +8 -8
data/EPUBChop.gemspec +1 -1
data/README.md +4 -0
data/bin/epubchop +3 -1
data/lib/EPUBChop/chop.rb +142 -34
data/lib/EPUBChop/version.rb +1 -1
data/spec/epub/default.epub +0 -0
data/spec/epubchop_spec.rb +1 -1
metadata +7 -6
/data/spec/{Verne_20000_West_pg11393.epub → epub/Verne_20000_West_pg11393.epub} +0 -0

checksums.yaml CHANGED Viewed

@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-    NTk5MTRjMjQ1ZDk2YTEzM2E3MWNkN2ViODEzYzdlYWQ4ODE0YWE4NA==
+    MmM1ZTY5M2E0NjMwN2ViZDFkYzUzODAyZmZhN2VmOGVkZTkwNTNkYQ==
   data.tar.gz: !binary |-
-    YzYyMTA1YmYzOWY1N2UwNWRlM2ZiOTIwMmU0NTRhNGQ2YjE3YTc3Nw==
+    N2IwZThjYjA2Yjg3YjRhZDBiOTliMDY5Y2FmMzJmMjc4YTI3NGI3MA==
 SHA512:
   metadata.gz: !binary |-
-    MjI4ZDgzNWY0NzI3NmQ4YzAzNzNlNTZkMGMzZDA4MjIwMmVhOTQ4Y2EwODI5
-    ZWZjNDNkNDRhOWI4N2YxYWMxZjc5MDU2MjFkMTIzYWQ2MTk5YmI2YTczZjEx
-    ZWFjMWY4YTgwN2FkYjJiOGNlYTJhNTk5ZGY3N2VlZDE5MGU0NWM=
+    MTg1YWMwYmU5NzI1ZTEwZTQxNzJlM2M0YzU3MGY2ZGZjZTE4NDk1ZmJmMDkw
+    OTRkN2M0OTQ3ZDQxYThhMjRlN2ZhZjA5ZGExODIyNTg1NDczNTQ4MWM0MjU4
+    MzdmNDMyMWNiMDNhNzQ4ZmQ3NDY3ODgxNzQ2MjQ2ZDI5MmI1MGY=
   data.tar.gz: !binary |-
-    YWYyZDNiMDFkOGExMWY5MGFiM2Y4MDdkYWQ3MWNlMDkxMTQ2MDkzNzIxOWE1
-    Mzg3ZDBhNGVmMGJiZmQ0ODgzMmQzYmFkZWU2ZmNhYmZkMjUxMzc0NDQzNzE1
-    YWNhOGZhMTVmMzhiODFhMTY1ODAzNGE2MDI5YmE5MzI4MGI3ZmI=
+    NTM0NzdkZDNmY2E3MzkwOGU0ODQxZjA5YmQ4ZWRjNDM1N2JlYzhkY2Q5YTcy
+    MDZiMjUyZDZmNzY4NDE1YzJhZTA1NzY0MjUzOWQ1ZDc3ZmQ3N2FkMzBjYjZm
+    NzQ3NGM2ZDUyODU0ZmEwNTA4OTQ4NGUwYTJlNDgzNmJlNDUzYjg=

data/EPUBChop.gemspec CHANGED Viewed

@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
   spec.email         = ["mehmet@celik.be"]
   spec.description   = %q{Create EPUB previews}
   spec.summary       = %q{Removes unwanted content from an EPUB}
-  spec.homepage      = ""
+  spec.homepage      = "https://github.com/mehmetc/EPUBChop"
   spec.license       = "MIT"
   spec.files         = `git ls-files`.split($/)

data/README.md CHANGED Viewed

@@ -19,10 +19,14 @@ where [options] are:
 ```
 ### Example:
+Create a new EPUB with 10% of the content; all other pages should contain the lines "Want to read more?" and "Buy the book!".
 ```ruby
 epubchop --words 10 --base percentage -line1 "Want to read more?" -line2 "Buy the book!" my.epub
 ```
+This gem depends on [epubinfo](http://github.com/chdorner/epubinfo). I made some additions to the gem, but they are still in a branch. Until they get accepted, I'll be using the [epubinfo_with_toc](https://github.com/mehmetc/epubinfo/tree/table_of_contents)
+gem.
 ## Contributing to EPUBChop
 * Fork the project.
 * Create a new branch to implement your bugfixes or features

data/bin/epubchop CHANGED Viewed

@@ -18,6 +18,7 @@ BANNER
   opt :base,  "How to interprete the --words options... Possible value: percentage", :type => :string, :default => 'percentage'
   opt :line1, "Text that is shown on line 1 of the chopped pages", :type => :string, :default => 'Continue reading?'
   opt :line2, "Text that is shown on line 2 of the chopped pages", :type => :string, :default => 'Go to your local library or buy the book.'
+  opt :chop_by, "Follow the SPINE or the NCX of the ePub", :type => :string, :default => :spine
 end
 Trollop::die "need an EPUB file name" if ARGV.empty?
@@ -30,11 +31,12 @@ begin
     text  = []
     text << options[:line1] if options.has_key?(:line1)
     text << options[:line2] if options.has_key?(:line2)
+    chop_by << options[:chop_by]
     puts 'loading EPUB'
     b=EPUBChop.get(filename)
     puts 'chopping EPUB'
-    c=b.chop({:base => base.to_s, :words => words, :text => text})
+    c=b.chop({:base => base.to_s, :words => words, :text => text, :chop_by => chop_by})
     puts 'rebuilding EPUB'
     FileUtils.move(c, "chopped_#{File.basename(filename)}")

data/lib/EPUBChop/chop.rb CHANGED Viewed

@@ -10,7 +10,6 @@ module EPUBChop
     def initialize(input, options ={})
       set_defaults(options)
       raise 'Please supply an input file name' if input.nil?
       #count the number of words in a file
@@ -31,14 +30,40 @@ module EPUBChop
       set_defaults(options)
       original_zip_file = @book.table_of_contents.parser.zip_file
+      extract_dir = extract_epub_to_tmp_dir(original_zip_file)
+      chop_files_in_tmp_dir(extract_dir)
+      remove_unused_media_from_tmp_dir(extract_dir)
+      return rebuild_epub_from_tmp_dir(extract_dir)
+    rescue Zip::ZipError => e
+      raise RuntimeError, "Error processing EPUB. #{e.message}"
+    rescue Exception => e
+      puts "Chopping went wrong. #{e.message}"
+      puts e.backtrace
+      return nil
+    ensure
+      FileUtils.remove_entry_secure(extract_dir)
+    end
+    private
+    def extract_epub_to_tmp_dir(original_zip_file)
       #unzip in temp dir
       extract_dir = Dir.mktmpdir('epub_extract')
       original_zip_file.entries.each do |e|
         file_dir = File.split(e.name)[0]
-        Dir.mkdir(File.join(extract_dir, file_dir)) unless Dir.exists?(File.join(extract_dir, file_dir)) || file_dir.eql?(".")
+        FileUtils.mkdir_p(File.join(extract_dir, file_dir)) unless Dir.exists?(File.join(extract_dir, file_dir)) || file_dir.eql?('.')
         original_zip_file.extract(e, File.join(extract_dir, e.name))
       end
+      extract_dir
+    end
+    def chop_files_in_tmp_dir(extract_dir)
       #fix spine files
       filename_list = @resource_word_count.keys
       filename_list.each do |filename|
@@ -50,27 +75,39 @@ module EPUBChop
             FileUtils.rm("#{extract_dir}/#{filename}", :force => true)
             FileUtils.touch "#{extract_dir}/#{filename}"
             File.open("#{extract_dir}/#{filename}", 'w') do |f|
-              f.puts empty_file
+              f.puts empty_file_with_cover(filename)
             end
           else
+            #noinspection RubyResolve
             resource = Nokogiri::XML(@book.table_of_contents.resources[filename]) do |config|
               config.noblanks.nonet
             end
             resource.css('script').remove
             resource.css('style').remove
             resource_text = resource.at_css('body').text.split[0..processed_file_size]
-            resource_text_length = resource_text.length
+            #resource_text_length = resource_text.length
             # get a string that can be found
             data = nil
-            window_begin = 5
+            window_begin = default_window_begin = 5
             window_end = 0
             while data.nil?
-              look_for = resource_text[(processed_file_size - window_begin)..(processed_file_size - window_end)].join(' ')
-              data = resource.at_css("p:contains('#{look_for}')")
-              window_begin += 1
-              window_end += 1
+              look_for = resource_text[(processed_file_size - window_begin)..(processed_file_size - window_end)]
+              if look_for.nil?
+                window_begin = default_window_begin += 5
+                window_end = 0
+              else
+                data = resource.at_css("*:contains('#{look_for.join(' ')}')")
+                window_begin -= 1
+                window_end += 1
+                if window_begin == window_end
+                  window_begin = default_window_begin += 5
+                  window_end = 0
+                end
+              end
             end
             #limit on found string
@@ -92,8 +129,9 @@ module EPUBChop
           end
         end
       end
-      #TODO:remove unwanted media
+    end
+    def rebuild_epub_from_tmp_dir(extract_dir)
       #zip new ebook
       new_ebook_name = Tempfile.new(['epub', '.epub'], Dir.tmpdir)
       new_ebook_name_path = new_ebook_name.path
@@ -101,24 +139,56 @@ module EPUBChop
       zipfile = Zip::File.open(new_ebook_name_path, Zip::File::CREATE)
-      Dir[File.join(extract_dir, '**', '**')].each do |file|
+      epub_files = Dir[File.join(extract_dir, '**', '**')]
+      #minetype should be the first entry and should not be zipped. Else FIDO will not know that this is an EPUB
+      mimetype = epub_files.delete("#{extract_dir}/mimetype")
+      mimetype_entry = Zip::Entry.new(zipfile, mimetype.sub("#{extract_dir}/", ''), '', '', 0,0, Zip::Entry::STORED)
+      zipfile.add(mimetype_entry, mimetype) unless mimetype.nil?
+      #all the other files
+      epub_files.each do |file|
         zipfile.add(file.sub("#{extract_dir}/", ''), file)
       end
       zipfile.close
-      return new_ebook_name_path
-    rescue Zip::ZipError => e
-      raise RuntimeError, ''
-    rescue Exception => e
-      puts "Chopping went wrong. #{e.message}"
-      puts e.backtrace
+      new_ebook_name_path
+    end
-      return nil
-    ensure
-      FileUtils.remove_entry_secure(extract_dir)
+    #noinspection RubyInstanceMethodNamingConvention
+    def remove_unused_media_from_tmp_dir(extract_dir)
+      #TODO: remove other media
+      #TODO: rebuild toc.ncx and content.opf
+      remove_unused_images_from_tmp_dir(extract_dir)
+    end
+    #noinspection RubyInstanceMethodNamingConvention
+    def remove_unused_images_from_tmp_dir(extract_dir)
+      puts 'removing unused media'
+      not_to_be_deleted_images = []
+      all_images = @book.table_of_contents.resources.images.map {|i| i[:uri]}
+      @book.table_of_contents.resources.html.each do |resource|
+        file = Nokogiri::HTML(File.read("#{extract_dir}/#{resource[:uri]}"))
+        all_images.each do |image|
+          i = image.split('/').last
+          data = file.at_css("img[src$='#{i}']")
+          if data
+            not_to_be_deleted_images << image
+          end
+        end
+      end
+      to_be_deleted_images = (all_images - not_to_be_deleted_images)
+      to_be_deleted_images.each do |image|
+        puts "\t\tremoving #{image}"
+        File.delete("#{extract_dir}/#{image}") if File.exists?("#{extract_dir}/#{image}")
+      end
+      to_be_deleted_images
     end
-    private
     def set_defaults(options)
       @words = options[:words] || 10
@@ -130,34 +200,69 @@ module EPUBChop
         @text1 = options[:text] || 'Continue reading? Go to your local library or buy the book.'
         @text2 = ''
       end
+      @chop_by = options[:chop_by] || :spine
     end
-    def empty_file
+    def empty_file_with_cover(filename)
+      number_of_subdirectories = filename.split('/').size - 1
+      cover_path = ''
+      number_of_subdirectories.times{ cover_path += '../'}
+      cover_path += @book.cover && @book.cover.exists? ? @book.cover.exists?.to_s : ''
       data = <<DATA
 <?xml version="1.0" encoding="utf-8" standalone="no"?>
-<!DOCTYPE html>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
 <html xmlns="http://www.w3.org/1999/xhtml">
-<head>
-<title>Read more</title>
-</head>
-<body>
-<center>
-<div style='width:100%;border:1px solid black;margin-top:20px;padding:5px'>
-<div><h2>#{@text1}</h2></div>
-<div><h2>#{@text2}</h2></div>
-</div>
-</center>
+  <head>
+      <title>Read more</title>
+  </head>
+  <body>
+  <div style="margin-top:100px;width:500px;margin-left:auto;margin-right:auto;">
+    <div style='text-align:center;'>
+      <h2>#{CGI.escape_html(@text1 ? @text1 : '')}</h2>
+      <span>#{CGI.escape_html(@text2 ? @text2 : '')}</span>
+    </div>
+    <div style="margin-top:20px;">
+      <div style="float:left;margin-right:30px;max-height: 190px; min-height: 120px; width: 125px;">
+        <img src="#{cover_path}" alt="" style="width:100%" />
+      </div>
+      <div style='padding-top:10px;'>
+        <h3>#{CGI.escape_html(@book.titles.first ? @book.titles.first : '' )}</h3>
+      </div>
+      <div>
+        <h4>#{CGI.escape_html(@book.creators.first ? @book.creators.first.name : '')}</h4>
+      </div>
+    </div>
+    <br />
+    <div style="clear:both;text-align:center;font-size:0.5em;"> #{CGI.escape_html(@book.rights ? @book.rights : '')} </div>
+  </div>
 </body>
 </html>
 DATA
+      data
     end
     def count_words(input)
       @book = EPUBInfo.get(input)
       resource_word_count = {}
       if @book
-        @book.table_of_contents.resources.spine.each do |resource|
+        chop_by = @chop_by.eql?(:ncx)  ? @book.table_of_contents.resources.ncx : @book.table_of_contents.resources.spine
+        chop_by.each do |resource|
           raw = Nokogiri::HTML(@book.table_of_contents.resources[resource[:uri]]) do |config|
+            #noinspection RubyResolve
             config.noblanks.nonet
           end
           raw.css('script').remove
@@ -187,11 +292,14 @@ DATA
       resource_allowed_word_count = @resource_word_count.select do |r|
         (word_counter += @resource_word_count[r]) < allowed_words
       end
       word_counter = resource_allowed_word_count.values.inject(0) { |sum, i| sum + i }
       how_many_words_left = allowed_words - word_counter
       if how_many_words_left > 0
         resource_to_split_name = @resource_word_count.keys[resource_allowed_word_count.length]
+        #noinspection RubyLocalVariableNamingConvention
         word_count_of_resource_to_split = @resource_word_count[resource_to_split_name]
         if  how_many_words_left < word_count_of_resource_to_split
           resource_allowed_word_count.store(resource_to_split_name, how_many_words_left)

data/lib/EPUBChop/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module EPUBChop
-  VERSION = "0.0.1"
+  VERSION = "0.0.6"
 end

data/spec/epub/default.epub ADDED Viewed

Binary file

data/spec/epubchop_spec.rb CHANGED Viewed

@@ -4,7 +4,7 @@ require 'spec_helper'
 describe 'EPUBChop' do
     before(:all) do
       #chop EPUB at 10% of total words
-      @chop = EPUBChop.get('./spec/Verne_20000_West_pg11393.epub', {:base => :percentage, :words => 10})
+      @chop = EPUBChop.get('./spec/epub/Verne_20000_West_pg11393.epub', {:base => :percentage, :words => 10})
     end
     it 'load an epub' do

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: EPUBChop
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.6
 platform: ruby
 authors:
 - Mehmet Celik
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-10-28 00:00:00.000000000 Z
+date: 2014-01-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -114,10 +114,11 @@ files:
 - lib/EPUBChop/chop.rb
 - lib/EPUBChop/version.rb
 - lib/trollop.rb
-- spec/Verne_20000_West_pg11393.epub
+- spec/epub/Verne_20000_West_pg11393.epub
+- spec/epub/default.epub
 - spec/epubchop_spec.rb
 - spec/spec_helper.rb
-homepage: ''
+homepage: https://github.com/mehmetc/EPUBChop
 licenses:
 - MIT
 metadata: {}
@@ -142,7 +143,7 @@ signing_key:
 specification_version: 4
 summary: Removes unwanted content from an EPUB
 test_files:
-- spec/Verne_20000_West_pg11393.epub
+- spec/epub/Verne_20000_West_pg11393.epub
+- spec/epub/default.epub
 - spec/epubchop_spec.rb
 - spec/spec_helper.rb
-has_rdoc:

/data/spec/{Verne_20000_West_pg11393.epub → epub/Verne_20000_West_pg11393.epub} RENAMED Viewed

File without changes