docsplit-paperclip-processor 0.2.0 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color --order random -f d
data/CHANGELOG CHANGED
@@ -1,5 +1,11 @@
1
- New in 0.2.0:
1
+ New in 0.2.2:
2
+ Feature: Added ability to save extracted text back to an ActiveRecord model's field
3
+ Documentation: Updated docs for extracting text and listed dependencies
4
+
5
+ New in 0.2.1:
6
+ Feature: Added Paperclip::DocsplitText for text extraction
2
7
 
8
+ New in 0.2.0:
3
9
  API CHANGE: Rails.logger no longer called
4
10
  Bug Fix: Was calling PaperclipError, now calls Paperclip::Error
5
11
  Test Coverage: Specs added for DocsplitProcessor, DocsplitPdf, DocsplitImage.
data/Gemfile CHANGED
@@ -1,4 +1,3 @@
1
1
  source "http://rubygems.org"
2
2
 
3
- gemspec
4
- gem "docsplit"
3
+ gemspec
data/README.md CHANGED
@@ -7,6 +7,7 @@ These include the Microsoft Office formats: doc, docx, ppt, xls and so on, as we
7
7
 
8
8
  * [Paperclip][0]
9
9
  * [Docsplit][1]
10
+ * [FileMagic][2]
10
11
 
11
12
  ## Installation ##
12
13
 
@@ -40,7 +41,25 @@ Use it as you would any other Paperclip processor. For example, in your model:
40
41
 
41
42
  which will convert your document into pdf.
42
43
 
43
- ### Extract information (text, metadata) and thumbnail ###
44
+ ### Extract text ###
45
+
46
+ WARNING: This feature is in alpha.
47
+
48
+ class Document < ActiveRecord::Base
49
+
50
+ has_attached_file :file,
51
+ :styles => {
52
+ :text => {
53
+ :processors => [:docsplit_text],
54
+ :full_text_column => :file_full_text
55
+ }
56
+ }
57
+
58
+ end
59
+
60
+ will extract the text from the uploaded file and deposit the full text of the file into the column 'file_full_text'.
61
+
62
+ ### Extract metadata and thumbnail ###
44
63
 
45
64
  Will be included in upcoming releases.
46
65
 
@@ -14,10 +14,16 @@ Gem::Specification.new do |s|
14
14
 
15
15
  s.rubyforge_project = "docsplit-paperclip-processor"
16
16
 
17
- s.add_dependency "paperclip", "~> 2.4"
17
+ s.add_dependency "paperclip", "~> 3.1"
18
18
  s.add_dependency 'ruby-filemagic'
19
19
  s.add_dependency 'docsplit'
20
+ s.add_development_dependency 'bundler', ">= 1.1.4"
20
21
  s.add_development_dependency 'rspec'
22
+ s.add_development_dependency 'activerecord'
23
+ s.add_development_dependency 'sqlite3'
24
+
25
+ # Runtime dependencies
26
+ s.add_runtime_dependency "paperclip"
21
27
 
22
28
  s.files = `git ls-files`.split("\n")
23
29
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
@@ -1,3 +1,4 @@
1
+ require "docsplit"
1
2
  require "paperclip"
2
3
  require "filemagic"
3
4
 
@@ -17,59 +18,8 @@ module Paperclip
17
18
  File.expand_path(@src.path)
18
19
  end
19
20
  end
21
+ end
20
22
 
21
- class DocsplitChaining < Processor
22
- attr_accessor :options, :attachment
23
-
24
- def initialize(file, options = {}, attachment = nil)
25
- super
26
- @options = options
27
- @attachment = attachment
28
- end
29
-
30
- def make
31
- attachment.to_file(options[:from_style] || :original)
32
- end
33
- end
34
-
35
- class DocsplitPdf < DocsplitProcessor
36
- def make
37
- begin
38
- dst_dir = Dir.tmpdir
39
- dst_path = File.join(dst_dir, "#{@basename}.pdf")
40
-
41
- if pdf_format?
42
- dst_path = File.join(dst_dir, "_#{@basename}.pdf")
43
- FileUtils.copy_file(src_path, dst_path)
44
- else
45
- Docsplit.extract_pdf(src_path, :output => dst_dir)
46
- end
47
- rescue Exception => e
48
- raise Paperclip::Error, "There was an error converting #{@basename} to pdf"
49
- end
50
- File.open(dst_path)
51
- end
52
-
53
- def pdf_format?
54
- file_magic = FileMagic.new
55
- type = file_magic.file(src_path)
56
- file_magic.close
57
- type =~ /pdf/i
58
- end
59
- end
60
-
61
- class DocsplitImage < DocsplitProcessor
62
- def make
63
- begin
64
- dst_path = Dir.tmpdir
65
- pages = options[:pages] || [1]
66
- options = @options.merge(:output => dst_path)
67
-
68
- Docsplit.extract_images(src_path, options)
69
- rescue Exception => e
70
- raise Paperclip::Error, "There was an error extracting images from #{@basename}"
71
- end
72
- File.open(File.join(dst_path, "#{@basename}_#{pages.first}.#{@options[:format]}"))
73
- end
74
- end
75
- end
23
+ require 'processors/docsplit_image'
24
+ require 'processors/docsplit_pdf'
25
+ require 'processors/docsplit_text'
@@ -1,7 +1,7 @@
1
1
  module Docsplit
2
2
  module Paperclip
3
3
  module Processor
4
- VERSION = "0.2.0"
4
+ VERSION = "0.2.2"
5
5
  end
6
6
  end
7
7
  end
@@ -0,0 +1,21 @@
1
+ module Paperclip
2
+ class DocsplitImage < DocsplitProcessor
3
+ def make
4
+ begin
5
+ @dst_path = Dir.tmpdir
6
+ @pages = @options[:pages] || [1]
7
+ @options = @options.merge(:output => @dst_path)
8
+
9
+ Docsplit.extract_images(src_path, @options)
10
+ rescue Exception => e
11
+ raise Paperclip::Error, "There was an error extracting images from #{@basename}"
12
+ end
13
+
14
+ destination_file
15
+ end
16
+
17
+ def destination_file
18
+ File.open(File.join(@dst_path, "#{@basename}_#{@pages.first}.#{@options[:format]}"))
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,27 @@
1
+ module Paperclip
2
+ class DocsplitPdf < DocsplitProcessor
3
+ def make
4
+ begin
5
+ dst_dir = Dir.tmpdir
6
+ dst_path = File.join(dst_dir, "#{@basename}.pdf")
7
+
8
+ if pdf_format?
9
+ dst_path = File.join(dst_dir, "_#{@basename}.pdf")
10
+ FileUtils.copy_file(src_path, dst_path)
11
+ else
12
+ Docsplit.extract_pdf(src_path, :output => dst_dir)
13
+ end
14
+ rescue Exception => e
15
+ raise Paperclip::Error, "There was an error converting #{@basename} to pdf"
16
+ end
17
+ File.open(dst_path)
18
+ end
19
+
20
+ def pdf_format?
21
+ file_magic = FileMagic.new
22
+ type = file_magic.file(src_path)
23
+ file_magic.close
24
+ type =~ /pdf/i
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,42 @@
1
+ module Paperclip
2
+ class DocsplitText < DocsplitProcessor
3
+ def make
4
+ begin
5
+ @dst_path = Dir.tmpdir
6
+ @pages = @options[:pages] || [1]
7
+ @options = @options.merge(:output => @dst_path)
8
+
9
+ Docsplit.extract_text(src_path, @options)
10
+ rescue Exception => e
11
+ raise Paperclip::Error, "There was an error extracting text from #{@basename}"
12
+ end
13
+
14
+ if @options[:full_text_column]
15
+ # Bypassing callbacks to save full text. See Paperclip issue #671:
16
+ # https://github.com/thoughtbot/paperclip/issues/671
17
+ ar_model = @attachment.instance
18
+ ar_model[@options[:full_text_column]] = full_text
19
+ ar_model.run_callbacks(:save) { false }
20
+
21
+ # This would be the preferred method of saving this text.
22
+ # @attachment.instance.update_attribute(@options[:full_text_column], full_text)
23
+ end
24
+
25
+ destination_file
26
+ end
27
+
28
+ def destination_file
29
+ File.open(File.join(@dst_path, "#{@basename}.txt"))
30
+ end
31
+
32
+ def full_text
33
+ full_text = String.new
34
+
35
+ destination_file.each do |line|
36
+ full_text += line
37
+ end
38
+
39
+ full_text
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,65 @@
1
+ require 'spec_helper'
2
+
3
+ describe Paperclip::DocsplitText do
4
+ before(:all) do
5
+ @file = File.open("./fixtures/word_xml.docx")
6
+ end
7
+
8
+ after(:all) do
9
+ @file.close
10
+ end
11
+
12
+ context "with no options supplied" do
13
+ before(:all) do
14
+ @options = {}
15
+ @processor = Paperclip::DocsplitText.new(@file, @options)
16
+ end
17
+
18
+ it "#make sends the correct commands to Docsplit" do
19
+ Docsplit.should_receive(:extract_text).with(File.expand_path(@file.path), @options.merge(:output => Dir.tmpdir))
20
+
21
+ @processor.make
22
+ end
23
+
24
+ it "#make returns the text tempfile created by Docsplit" do
25
+ result = @processor.make
26
+
27
+ text = String.new
28
+ result.each do |line|
29
+ text += line
30
+ end
31
+
32
+ text.should eq("This is a test document.\n\n\f")
33
+ end
34
+ end
35
+
36
+ context "with a destination column for extracted text" do
37
+ before(:all) do
38
+ @options = {:full_text_column => :document_full_text}
39
+ @doc = Document.new()
40
+ end
41
+
42
+ after(:all) do
43
+ FileUtils.rm_rf("./spec/tmp", secure: true)
44
+ end
45
+
46
+ it "#make stores the full text in the specified field" do
47
+ @doc.original = @file
48
+ @doc.save!
49
+
50
+ @doc.reload
51
+
52
+ @doc.original_full_text.should eq("This is a test document.\n\n\f")
53
+ end
54
+ end
55
+
56
+ context "when processing fails" do
57
+ it "#make raises an error if the processing was unsuccessful" do
58
+ Dir.stub!(:tmpdir).and_return(:raise)
59
+
60
+ lambda {
61
+ Paperclip::DocsplitText.new(@file, {}).make
62
+ }.should raise_error(Paperclip::Error)
63
+ end
64
+ end
65
+ end
@@ -1,55 +1,47 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe Paperclip::DocsplitImage do
4
- def pdf_jpg_images
5
- Dir.entries(Dir.tmpdir).reject{ |x| !(x =~ /twopage_\d.jpg/) }
4
+ def delete_temp_images
5
+ Dir.entries(Dir.tmpdir).reject{ |x| !(x =~ /twopage_\d.jpg/) }.each do |tempfile|
6
+ File.delete(File.join(Dir.tmpdir, tempfile))
7
+ end
8
+ end
9
+
10
+ before(:all) do
11
+ delete_temp_images
12
+ @file = File.open("./fixtures/twopage.pdf")
13
+ end
14
+
15
+ after(:all) do
16
+ delete_temp_images
17
+ @file.close
6
18
  end
7
19
 
8
20
  context "with a valid pdf file attachment" do
9
21
  before(:all) do
10
- pdf_jpg_images.each do |tempfile|
11
- File.delete(File.join(Dir.tmpdir, tempfile))
12
- end
13
-
14
- @file = File.open("./fixtures/twopage.pdf")
15
- @processor = Paperclip::DocsplitImage.new(@file, {:format => :jpg, :size => "50x50"})
16
- @output = @processor.make
22
+ @options = {:format => :jpg, :size => "50x50"}
23
+ @processor = Paperclip::DocsplitImage.new(@file, @options)
17
24
  end
18
25
 
19
- after(:all) do
20
- @file.close
26
+ it "#make sends the correct commands to Docsplit" do
27
+ @processor.stub!(:destination_file)
28
+ Docsplit.should_receive(:extract_images).with(File.expand_path(@file.path), @options.merge(:output => Dir.tmpdir))
29
+
30
+ @processor.make
21
31
  end
22
32
 
23
- it "#make generates an image for each page of the document" do
24
- pdf_jpg_images.count.should eq(2)
25
- end
26
-
27
- it "#make generates images at the specified resolution" do
28
- cmd = %Q[identify -format "%wx%h" "#{@output.path}"]
29
- `#{cmd}`.chomp.should eq("39x50")
30
- end
31
-
32
- it "#make generates images in the specified format" do
33
- pdf_jpg_images.each do |output_file|
34
- FileMagic.new.file(File.join(Dir.tmpdir, output_file)).should =~ /jpeg/i
35
- end
36
- end
37
-
38
33
  it "#make returns the image of the first page" do
39
- File.basename(@output).should eq('twopage_1.jpg')
34
+ @processor.make.path.should eq(File.open(Dir.tmpdir + '/twopage_1.jpg').path)
40
35
  end
41
36
  end
42
37
 
43
38
  context "when processing fails" do
44
39
  it "#make raises an error if the processing was unsuccessful" do
45
- @file = File.open("./fixtures/twopage.pdf")
46
40
  Dir.stub!(:tmpdir).and_return(:raise)
47
41
 
48
42
  lambda {
49
43
  Paperclip::DocsplitImage.new(@file, {:format => :jpg}).make
50
44
  }.should raise_error(Paperclip::Error)
51
-
52
- @file.close
53
45
  end
54
46
  end
55
47
  end
@@ -0,0 +1,10 @@
1
+ ActiveRecord::Schema.define :version => 0 do
2
+ create_table "documents", :force => true do |t|
3
+ t.string :owner
4
+ t.string :original_file_name
5
+ t.string :original_content_type
6
+ t.integer :original_updated_at
7
+ t.integer :original_file_size
8
+ t.text :original_full_text
9
+ end
10
+ end
@@ -1,7 +1,33 @@
1
- require "paperclip"
2
- require "filemagic"
3
- require "docsplit"
4
- require "docsplit-paperclip-processor"
1
+ require 'paperclip'
2
+ require 'paperclip/railtie'
3
+ require 'filemagic'
4
+ require 'docsplit'
5
+ require 'docsplit-paperclip-processor'
5
6
 
6
7
  require 'rspec'
7
- require 'rspec/autorun'
8
+ require 'rspec/autorun'
9
+
10
+ # Prepare activerecord
11
+ require "active_record"
12
+
13
+ # Connect to sqlite
14
+ ActiveRecord::Base.establish_connection(
15
+ "adapter" => "sqlite3",
16
+ "database" => ":memory:"
17
+ )
18
+
19
+ ActiveRecord::Base.logger = Logger.new(nil)
20
+ load(File.join(File.dirname(__FILE__), 'schema.rb'))
21
+
22
+ Paperclip::Railtie.insert
23
+
24
+ class Document < ActiveRecord::Base
25
+ has_attached_file :original,
26
+ :storage => :filesystem,
27
+ :path => "./spec/tmp/:id.:extension",
28
+ :url => "/spec/tmp/:id.:extension",
29
+ :styles => {
30
+ :text => {:full_text_column => :original_full_text}
31
+ },
32
+ :processors => [:docsplit_text]
33
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit-paperclip-processor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-14 00:00:00.000000000 Z
12
+ date: 2012-08-18 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: paperclip
@@ -18,7 +18,7 @@ dependencies:
18
18
  requirements:
19
19
  - - ~>
20
20
  - !ruby/object:Gem::Version
21
- version: '2.4'
21
+ version: '3.1'
22
22
  type: :runtime
23
23
  prerelease: false
24
24
  version_requirements: !ruby/object:Gem::Requirement
@@ -26,7 +26,7 @@ dependencies:
26
26
  requirements:
27
27
  - - ~>
28
28
  - !ruby/object:Gem::Version
29
- version: '2.4'
29
+ version: '3.1'
30
30
  - !ruby/object:Gem::Dependency
31
31
  name: ruby-filemagic
32
32
  requirement: !ruby/object:Gem::Requirement
@@ -59,6 +59,22 @@ dependencies:
59
59
  - - ! '>='
60
60
  - !ruby/object:Gem::Version
61
61
  version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: bundler
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: 1.1.4
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: 1.1.4
62
78
  - !ruby/object:Gem::Dependency
63
79
  name: rspec
64
80
  requirement: !ruby/object:Gem::Requirement
@@ -75,6 +91,54 @@ dependencies:
75
91
  - - ! '>='
76
92
  - !ruby/object:Gem::Version
77
93
  version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: activerecord
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :development
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ - !ruby/object:Gem::Dependency
111
+ name: sqlite3
112
+ requirement: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ! '>='
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
126
+ - !ruby/object:Gem::Dependency
127
+ name: paperclip
128
+ requirement: !ruby/object:Gem::Requirement
129
+ none: false
130
+ requirements:
131
+ - - ! '>='
132
+ - !ruby/object:Gem::Version
133
+ version: '0'
134
+ type: :runtime
135
+ prerelease: false
136
+ version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
138
+ requirements:
139
+ - - ! '>='
140
+ - !ruby/object:Gem::Version
141
+ version: '0'
78
142
  description: This gem is simple Paperclip processor which uses Docsplit to convert
79
143
  uploaded files to pdf, or extract information/thumbnails from them
80
144
  email:
@@ -84,6 +148,7 @@ extensions: []
84
148
  extra_rdoc_files: []
85
149
  files:
86
150
  - .gitignore
151
+ - .rspec
87
152
  - CHANGELOG
88
153
  - Gemfile
89
154
  - README.md
@@ -96,9 +161,14 @@ files:
96
161
  - fixtures/word_xml.docx
97
162
  - lib/docsplit-paperclip-processor.rb
98
163
  - lib/docsplit-paperclip-processor/version.rb
164
+ - lib/processors/docsplit_image.rb
165
+ - lib/processors/docsplit_pdf.rb
166
+ - lib/processors/docsplit_text.rb
99
167
  - spec/docsplit-paperclip-processor_spec.rb
168
+ - spec/docsplit_extract_text_spec.rb
100
169
  - spec/docsplit_image_spec.rb
101
170
  - spec/docsplit_pdf_spec.rb
171
+ - spec/schema.rb
102
172
  - spec/spec_helper.rb
103
173
  homepage: https://github.com/tienle/docsplit-paperclip-processor
104
174
  licenses: []
@@ -126,7 +196,8 @@ specification_version: 3
126
196
  summary: A Paperclip processor for Docsplit
127
197
  test_files:
128
198
  - spec/docsplit-paperclip-processor_spec.rb
199
+ - spec/docsplit_extract_text_spec.rb
129
200
  - spec/docsplit_image_spec.rb
130
201
  - spec/docsplit_pdf_spec.rb
202
+ - spec/schema.rb
131
203
  - spec/spec_helper.rb
132
- has_rdoc: