RubyGems - docsplit-paperclip-processor - Versions diffs - 0.2.0 → 0.2.2 - Mend

docsplit-paperclip-processor 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

data/.rspec +1 -0
data/CHANGELOG +7 -1
data/Gemfile +1 -2
data/README.md +20 -1
data/docsplit-paperclip-processor.gemspec +7 -1
data/lib/docsplit-paperclip-processor.rb +5 -55
data/lib/docsplit-paperclip-processor/version.rb +1 -1
data/lib/processors/docsplit_image.rb +21 -0
data/lib/processors/docsplit_pdf.rb +27 -0
data/lib/processors/docsplit_text.rb +42 -0
data/spec/docsplit_extract_text_spec.rb +65 -0
data/spec/docsplit_image_spec.rb +22 -30
data/spec/schema.rb +10 -0
data/spec/spec_helper.rb +31 -5
metadata +76 -5

data/.rspec ADDED

	@@ -0,0 +1 @@
1	+ --color --order random -f d

data/CHANGELOG CHANGED

@@ -1,5 +1,11 @@
-New in 0.2.0:
+New in 0.2.2:
+Feature: Added ability to save extracted text back to an ActiveRecord model's field
+Documentation: Updated docs for extracting text and listed dependencies
+New in 0.2.1:
+Feature: Added Paperclip::DocsplitText for text extraction
+New in 0.2.0:
 API CHANGE: Rails.logger no longer called
 Bug Fix: Was calling PaperclipError, now calls Paperclip::Error
 Test Coverage: Specs added for DocsplitProcessor, DocsplitPdf, DocsplitImage.

data/Gemfile CHANGED

@@ -1,4 +1,3 @@
 source "http://rubygems.org"
-gemspec
-gem "docsplit"
+gemspec

data/README.md CHANGED

@@ -7,6 +7,7 @@ These include the Microsoft Office formats: doc, docx, ppt, xls and so on, as we
 * [Paperclip][0]
 * [Docsplit][1]
+* [FileMagic][2]
 ## Installation ##
@@ -40,7 +41,25 @@ Use it as you would any other Paperclip processor. For example, in your model:
 which will convert your document into pdf.
-### Extract information (text, metadata) and thumbnail ###
+### Extract text ###
+WARNING: This feature is in alpha.
+    class Document < ActiveRecord::Base
+      has_attached_file :file,
+                        :styles => {
+                          :text => {
+                            :processors => [:docsplit_text],
+                            :full_text_column => :file_full_text
+                          }
+                        }
+    end
+will extract the text from the uploaded file and deposit the full text of the file into the column 'file_full_text'.
+### Extract metadata and thumbnail ###
 Will be included in upcoming releases.

data/docsplit-paperclip-processor.gemspec CHANGED

@@ -14,10 +14,16 @@ Gem::Specification.new do |s|
   s.rubyforge_project = "docsplit-paperclip-processor"
-  s.add_dependency "paperclip", "~> 2.4"
+  s.add_dependency "paperclip", "~> 3.1"
   s.add_dependency 'ruby-filemagic'
   s.add_dependency 'docsplit'
+  s.add_development_dependency 'bundler', ">= 1.1.4"
   s.add_development_dependency 'rspec'
+  s.add_development_dependency 'activerecord'
+  s.add_development_dependency 'sqlite3'
+  # Runtime dependencies
+  s.add_runtime_dependency "paperclip"
   s.files         = `git ls-files`.split("\n")
   s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")

data/lib/docsplit-paperclip-processor.rb CHANGED

@@ -1,3 +1,4 @@
+require "docsplit"
 require "paperclip"
 require "filemagic"
@@ -17,59 +18,8 @@ module Paperclip
       File.expand_path(@src.path)
     end
   end
+end
-  class DocsplitChaining < Processor
-    attr_accessor :options, :attachment
-    def initialize(file, options = {}, attachment = nil)
-      super
-      @options    = options
-      @attachment = attachment
-    end
-    def make
-      attachment.to_file(options[:from_style] || :original)
-    end
-  end
-  class DocsplitPdf < DocsplitProcessor
-    def make
-      begin
-        dst_dir = Dir.tmpdir
-        dst_path = File.join(dst_dir, "#{@basename}.pdf")
-        if pdf_format?
-          dst_path = File.join(dst_dir, "_#{@basename}.pdf")
-          FileUtils.copy_file(src_path, dst_path)
-        else
-          Docsplit.extract_pdf(src_path, :output => dst_dir)
-        end
-      rescue Exception => e
-        raise Paperclip::Error, "There was an error converting #{@basename} to pdf"
-      end
-      File.open(dst_path)
-    end
-    def pdf_format?
-      file_magic = FileMagic.new
-      type = file_magic.file(src_path)
-      file_magic.close
-      type =~ /pdf/i
-    end
-  end
-  class DocsplitImage < DocsplitProcessor
-    def make
-      begin
-        dst_path = Dir.tmpdir
-        pages    = options[:pages] || [1]
-        options  = @options.merge(:output => dst_path)
-        Docsplit.extract_images(src_path, options)
-      rescue Exception => e
-        raise Paperclip::Error, "There was an error extracting images from #{@basename}"
-      end
-      File.open(File.join(dst_path, "#{@basename}_#{pages.first}.#{@options[:format]}"))
-    end
-  end
-end
+require 'processors/docsplit_image'
+require 'processors/docsplit_pdf'
+require 'processors/docsplit_text'

data/lib/docsplit-paperclip-processor/version.rb CHANGED

@@ -1,7 +1,7 @@
 module Docsplit
   module Paperclip
     module Processor
-      VERSION = "0.2.0"
+      VERSION = "0.2.2"
     end
   end
 end

data/lib/processors/docsplit_image.rb ADDED

@@ -0,0 +1,21 @@
+module Paperclip
+  class DocsplitImage < DocsplitProcessor
+    def make
+      begin
+        @dst_path = Dir.tmpdir
+        @pages    = @options[:pages] || [1]
+        @options  = @options.merge(:output => @dst_path)
+        Docsplit.extract_images(src_path, @options)
+      rescue Exception => e
+        raise Paperclip::Error, "There was an error extracting images from #{@basename}"
+      end
+      destination_file
+    end
+    def destination_file
+      File.open(File.join(@dst_path, "#{@basename}_#{@pages.first}.#{@options[:format]}"))
+    end
+  end
+end

data/lib/processors/docsplit_pdf.rb ADDED

@@ -0,0 +1,27 @@
+module Paperclip
+ class DocsplitPdf < DocsplitProcessor
+    def make
+      begin
+        dst_dir = Dir.tmpdir
+        dst_path = File.join(dst_dir, "#{@basename}.pdf")
+        if pdf_format?
+          dst_path = File.join(dst_dir, "_#{@basename}.pdf")
+          FileUtils.copy_file(src_path, dst_path)
+        else
+          Docsplit.extract_pdf(src_path, :output => dst_dir)
+        end
+      rescue Exception => e
+        raise Paperclip::Error, "There was an error converting #{@basename} to pdf"
+      end
+      File.open(dst_path)
+    end
+    def pdf_format?
+      file_magic = FileMagic.new
+      type = file_magic.file(src_path)
+      file_magic.close
+      type =~ /pdf/i
+    end
+  end
+end

data/lib/processors/docsplit_text.rb ADDED

@@ -0,0 +1,42 @@
+module Paperclip
+  class DocsplitText < DocsplitProcessor
+    def make
+      begin
+        @dst_path = Dir.tmpdir
+        @pages    = @options[:pages] || [1]
+        @options  = @options.merge(:output => @dst_path)
+        Docsplit.extract_text(src_path, @options)
+      rescue Exception => e
+        raise Paperclip::Error, "There was an error extracting text from #{@basename}"
+      end
+      if @options[:full_text_column]
+        # Bypassing callbacks to save full text. See Paperclip issue #671:
+        # https://github.com/thoughtbot/paperclip/issues/671
+        ar_model = @attachment.instance
+        ar_model[@options[:full_text_column]] = full_text
+        ar_model.run_callbacks(:save) { false }
+        # This would be the preferred method of saving this text.
+        # @attachment.instance.update_attribute(@options[:full_text_column], full_text)
+      end
+      destination_file
+    end
+    def destination_file
+      File.open(File.join(@dst_path, "#{@basename}.txt"))
+    end
+    def full_text
+      full_text = String.new
+      destination_file.each do |line|
+        full_text += line
+      end
+      full_text
+    end
+  end
+end

data/spec/docsplit_extract_text_spec.rb ADDED

@@ -0,0 +1,65 @@
+require 'spec_helper'
+describe Paperclip::DocsplitText do
+  before(:all) do
+    @file = File.open("./fixtures/word_xml.docx")
+  end
+  after(:all) do
+    @file.close
+  end
+  context "with no options supplied" do
+    before(:all) do
+      @options = {}
+      @processor = Paperclip::DocsplitText.new(@file, @options)
+    end
+    it "#make sends the correct commands to Docsplit" do
+      Docsplit.should_receive(:extract_text).with(File.expand_path(@file.path), @options.merge(:output => Dir.tmpdir))
+      @processor.make
+    end
+    it "#make returns the text tempfile created by Docsplit" do
+      result = @processor.make
+      text = String.new
+      result.each do |line|
+        text += line
+      end
+      text.should eq("This is a test document.\n\n\f")
+    end
+  end
+  context "with a destination column for extracted text" do
+    before(:all) do
+      @options = {:full_text_column => :document_full_text}
+      @doc = Document.new()
+    end
+    after(:all) do
+      FileUtils.rm_rf("./spec/tmp", secure: true)
+    end
+    it "#make stores the full text in the specified field" do
+      @doc.original = @file
+      @doc.save!
+      @doc.reload
+      @doc.original_full_text.should eq("This is a test document.\n\n\f")
+    end
+  end
+  context "when processing fails" do
+    it "#make raises an error if the processing was unsuccessful" do
+      Dir.stub!(:tmpdir).and_return(:raise)
+      lambda {
+        Paperclip::DocsplitText.new(@file, {}).make
+      }.should raise_error(Paperclip::Error)
+    end
+  end
+end

data/spec/docsplit_image_spec.rb CHANGED

@@ -1,55 +1,47 @@
 require 'spec_helper'
 describe Paperclip::DocsplitImage do
-  def pdf_jpg_images
-    Dir.entries(Dir.tmpdir).reject{ |x| !(x =~ /twopage_\d.jpg/) }
+  def delete_temp_images
+    Dir.entries(Dir.tmpdir).reject{ |x| !(x =~ /twopage_\d.jpg/) }.each do |tempfile|
+      File.delete(File.join(Dir.tmpdir, tempfile))
+    end
+  end
+  before(:all) do
+    delete_temp_images
+    @file = File.open("./fixtures/twopage.pdf")
+  end
+  after(:all) do
+    delete_temp_images
+    @file.close
   end
   context "with a valid pdf file attachment" do
     before(:all) do
-      pdf_jpg_images.each do |tempfile|
-        File.delete(File.join(Dir.tmpdir, tempfile))
-      end
-      @file = File.open("./fixtures/twopage.pdf")
-      @processor = Paperclip::DocsplitImage.new(@file, {:format => :jpg, :size => "50x50"})
-      @output = @processor.make
+      @options = {:format => :jpg, :size => "50x50"}
+      @processor = Paperclip::DocsplitImage.new(@file, @options)
     end
-    after(:all) do
-      @file.close
+    it "#make sends the correct commands to Docsplit" do
+      @processor.stub!(:destination_file)
+      Docsplit.should_receive(:extract_images).with(File.expand_path(@file.path), @options.merge(:output => Dir.tmpdir))
+      @processor.make
     end
-    it "#make generates an image for each page of the document" do
-      pdf_jpg_images.count.should eq(2)
-    end
-    it "#make generates images at the specified resolution" do
-      cmd = %Q[identify -format "%wx%h" "#{@output.path}"]
-      `#{cmd}`.chomp.should eq("39x50")
-    end
-    it "#make generates images in the specified format" do
-      pdf_jpg_images.each do |output_file|
-        FileMagic.new.file(File.join(Dir.tmpdir, output_file)).should =~ /jpeg/i
-      end
-    end
     it "#make returns the image of the first page" do
-      File.basename(@output).should eq('twopage_1.jpg')
+      @processor.make.path.should eq(File.open(Dir.tmpdir + '/twopage_1.jpg').path)
     end
   end
   context "when processing fails" do
     it "#make raises an error if the processing was unsuccessful" do
-      @file = File.open("./fixtures/twopage.pdf")
       Dir.stub!(:tmpdir).and_return(:raise)
       lambda {
         Paperclip::DocsplitImage.new(@file, {:format => :jpg}).make
       }.should raise_error(Paperclip::Error)
-      @file.close
     end
   end
 end

data/spec/schema.rb ADDED

@@ -0,0 +1,10 @@
+ActiveRecord::Schema.define :version => 0 do
+  create_table "documents", :force => true do |t|
+    t.string :owner
+    t.string :original_file_name
+    t.string :original_content_type
+    t.integer :original_updated_at
+    t.integer :original_file_size
+    t.text :original_full_text
+  end
+end

data/spec/spec_helper.rb CHANGED

@@ -1,7 +1,33 @@
-require "paperclip"
-require "filemagic"
-require "docsplit"
-require "docsplit-paperclip-processor"
+require 'paperclip'
+require 'paperclip/railtie'
+require 'filemagic'
+require 'docsplit'
+require 'docsplit-paperclip-processor'
 require 'rspec'
-require 'rspec/autorun'
+require 'rspec/autorun'
+# Prepare activerecord
+require "active_record"
+# Connect to sqlite
+ActiveRecord::Base.establish_connection(
+  "adapter" => "sqlite3",
+  "database" => ":memory:"
+)
+ActiveRecord::Base.logger = Logger.new(nil)
+load(File.join(File.dirname(__FILE__), 'schema.rb'))
+Paperclip::Railtie.insert
+class Document < ActiveRecord::Base
+	has_attached_file :original,
+		:storage => :filesystem,
+    	:path => "./spec/tmp/:id.:extension",
+    	:url => "/spec/tmp/:id.:extension",
+    	:styles => {
+    		:text => {:full_text_column => :original_full_text}
+    	},
+    	:processors => [:docsplit_text]
+end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: docsplit-paperclip-processor
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.2.2
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-07-14 00:00:00.000000000 Z
+date: 2012-08-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: paperclip
@@ -18,7 +18,7 @@ dependencies:
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        version: '2.4'
+        version: '3.1'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -26,7 +26,7 @@ dependencies:
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        version: '2.4'
+        version: '3.1'
 - !ruby/object:Gem::Dependency
   name: ruby-filemagic
   requirement: !ruby/object:Gem::Requirement
@@ -59,6 +59,22 @@ dependencies:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.1.4
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.1.4
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
@@ -75,6 +91,54 @@ dependencies:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: activerecord
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: sqlite3
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: paperclip
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
 description: This gem is simple Paperclip processor which uses Docsplit to convert
   uploaded files to pdf, or extract information/thumbnails from them
 email:
@@ -84,6 +148,7 @@ extensions: []
 extra_rdoc_files: []
 files:
 - .gitignore
+- .rspec
 - CHANGELOG
 - Gemfile
 - README.md
@@ -96,9 +161,14 @@ files:
 - fixtures/word_xml.docx
 - lib/docsplit-paperclip-processor.rb
 - lib/docsplit-paperclip-processor/version.rb
+- lib/processors/docsplit_image.rb
+- lib/processors/docsplit_pdf.rb
+- lib/processors/docsplit_text.rb
 - spec/docsplit-paperclip-processor_spec.rb
+- spec/docsplit_extract_text_spec.rb
 - spec/docsplit_image_spec.rb
 - spec/docsplit_pdf_spec.rb
+- spec/schema.rb
 - spec/spec_helper.rb
 homepage: https://github.com/tienle/docsplit-paperclip-processor
 licenses: []
@@ -126,7 +196,8 @@ specification_version: 3
 summary: A Paperclip processor for Docsplit
 test_files:
 - spec/docsplit-paperclip-processor_spec.rb
+- spec/docsplit_extract_text_spec.rb
 - spec/docsplit_image_spec.rb
 - spec/docsplit_pdf_spec.rb
+- spec/schema.rb
 - spec/spec_helper.rb
-has_rdoc: