docsplit-paperclip-processor 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color --order random -f d
data/CHANGELOG CHANGED
@@ -1,5 +1,11 @@
1
- New in 0.2.0:
1
+ New in 0.2.2:
2
+ Feature: Added ability to save extracted text back to an ActiveRecord model's field
3
+ Documentation: Updated docs for extracting text and listed dependencies
4
+
5
+ New in 0.2.1:
6
+ Feature: Added Paperclip::DocsplitText for text extraction
2
7
 
8
+ New in 0.2.0:
3
9
  API CHANGE: Rails.logger no longer called
4
10
  Bug Fix: Was calling PaperclipError, now calls Paperclip::Error
5
11
  Test Coverage: Specs added for DocsplitProcessor, DocsplitPdf, DocsplitImage.
data/Gemfile CHANGED
@@ -1,4 +1,3 @@
1
1
  source "http://rubygems.org"
2
2
 
3
- gemspec
4
- gem "docsplit"
3
+ gemspec
data/README.md CHANGED
@@ -7,6 +7,7 @@ These include the Microsoft Office formats: doc, docx, ppt, xls and so on, as we
7
7
 
8
8
  * [Paperclip][0]
9
9
  * [Docsplit][1]
10
+ * [FileMagic][2]
10
11
 
11
12
  ## Installation ##
12
13
 
@@ -40,7 +41,25 @@ Use it as you would any other Paperclip processor. For example, in your model:
40
41
 
41
42
  which will convert your document into pdf.
42
43
 
43
- ### Extract information (text, metadata) and thumbnail ###
44
+ ### Extract text ###
45
+
46
+ WARNING: This feature is in alpha.
47
+
48
+ class Document < ActiveRecord::Base
49
+
50
+ has_attached_file :file,
51
+ :styles => {
52
+ :text => {
53
+ :processors => [:docsplit_text],
54
+ :full_text_column => :file_full_text
55
+ }
56
+ }
57
+
58
+ end
59
+
60
+ will extract the text from the uploaded file, and deposit the full text of the file into the column 'file_full_text'.
61
+
62
+ ### Extract metadata and thumbnail ###
44
63
 
45
64
  Will be included in upcoming releases.
46
65
 
@@ -14,10 +14,16 @@ Gem::Specification.new do |s|
14
14
 
15
15
  s.rubyforge_project = "docsplit-paperclip-processor"
16
16
 
17
- s.add_dependency "paperclip", "~> 2.4"
17
+ s.add_dependency "paperclip", "~> 3.1"
18
18
  s.add_dependency 'ruby-filemagic'
19
19
  s.add_dependency 'docsplit'
20
+ s.add_development_dependency 'bundler', ">= 1.1.4"
20
21
  s.add_development_dependency 'rspec'
22
+ s.add_development_dependency 'activerecord'
23
+ s.add_development_dependency 'sqlite3'
24
+
25
+ # Runtime dependencies
26
+ s.add_runtime_dependency "paperclip"
21
27
 
22
28
  s.files = `git ls-files`.split("\n")
23
29
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
@@ -1,3 +1,4 @@
1
+ require "docsplit"
1
2
  require "paperclip"
2
3
  require "filemagic"
3
4
 
@@ -17,59 +18,8 @@ module Paperclip
17
18
  File.expand_path(@src.path)
18
19
  end
19
20
  end
21
+ end
20
22
 
21
- class DocsplitChaining < Processor
22
- attr_accessor :options, :attachment
23
-
24
- def initialize(file, options = {}, attachment = nil)
25
- super
26
- @options = options
27
- @attachment = attachment
28
- end
29
-
30
- def make
31
- attachment.to_file(options[:from_style] || :original)
32
- end
33
- end
34
-
35
- class DocsplitPdf < DocsplitProcessor
36
- def make
37
- begin
38
- dst_dir = Dir.tmpdir
39
- dst_path = File.join(dst_dir, "#{@basename}.pdf")
40
-
41
- if pdf_format?
42
- dst_path = File.join(dst_dir, "_#{@basename}.pdf")
43
- FileUtils.copy_file(src_path, dst_path)
44
- else
45
- Docsplit.extract_pdf(src_path, :output => dst_dir)
46
- end
47
- rescue Exception => e
48
- raise Paperclip::Error, "There was an error converting #{@basename} to pdf"
49
- end
50
- File.open(dst_path)
51
- end
52
-
53
- def pdf_format?
54
- file_magic = FileMagic.new
55
- type = file_magic.file(src_path)
56
- file_magic.close
57
- type =~ /pdf/i
58
- end
59
- end
60
-
61
- class DocsplitImage < DocsplitProcessor
62
- def make
63
- begin
64
- dst_path = Dir.tmpdir
65
- pages = options[:pages] || [1]
66
- options = @options.merge(:output => dst_path)
67
-
68
- Docsplit.extract_images(src_path, options)
69
- rescue Exception => e
70
- raise Paperclip::Error, "There was an error extracting images from #{@basename}"
71
- end
72
- File.open(File.join(dst_path, "#{@basename}_#{pages.first}.#{@options[:format]}"))
73
- end
74
- end
75
- end
23
+ require 'processors/docsplit_image'
24
+ require 'processors/docsplit_pdf'
25
+ require 'processors/docsplit_text'
@@ -1,7 +1,7 @@
1
1
  module Docsplit
2
2
  module Paperclip
3
3
  module Processor
4
- VERSION = "0.2.0"
4
+ VERSION = "0.2.2"
5
5
  end
6
6
  end
7
7
  end
@@ -0,0 +1,21 @@
1
+ module Paperclip
2
+ class DocsplitImage < DocsplitProcessor
3
+ def make
4
+ begin
5
+ @dst_path = Dir.tmpdir
6
+ @pages = @options[:pages] || [1]
7
+ @options = @options.merge(:output => @dst_path)
8
+
9
+ Docsplit.extract_images(src_path, @options)
10
+ rescue Exception => e
11
+ raise Paperclip::Error, "There was an error extracting images from #{@basename}"
12
+ end
13
+
14
+ destination_file
15
+ end
16
+
17
+ def destination_file
18
+ File.open(File.join(@dst_path, "#{@basename}_#{@pages.first}.#{@options[:format]}"))
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,27 @@
1
+ module Paperclip
2
+ class DocsplitPdf < DocsplitProcessor
3
+ def make
4
+ begin
5
+ dst_dir = Dir.tmpdir
6
+ dst_path = File.join(dst_dir, "#{@basename}.pdf")
7
+
8
+ if pdf_format?
9
+ dst_path = File.join(dst_dir, "_#{@basename}.pdf")
10
+ FileUtils.copy_file(src_path, dst_path)
11
+ else
12
+ Docsplit.extract_pdf(src_path, :output => dst_dir)
13
+ end
14
+ rescue Exception => e
15
+ raise Paperclip::Error, "There was an error converting #{@basename} to pdf"
16
+ end
17
+ File.open(dst_path)
18
+ end
19
+
20
+ def pdf_format?
21
+ file_magic = FileMagic.new
22
+ type = file_magic.file(src_path)
23
+ file_magic.close
24
+ type =~ /pdf/i
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,42 @@
1
+ module Paperclip
2
+ class DocsplitText < DocsplitProcessor
3
+ def make
4
+ begin
5
+ @dst_path = Dir.tmpdir
6
+ @pages = @options[:pages] || [1]
7
+ @options = @options.merge(:output => @dst_path)
8
+
9
+ Docsplit.extract_text(src_path, @options)
10
+ rescue Exception => e
11
+ raise Paperclip::Error, "There was an error extracting text from #{@basename}"
12
+ end
13
+
14
+ if @options[:full_text_column]
15
+ # Bypassing callbacks to save full text. See Paperclip issue #671:
16
+ # https://github.com/thoughtbot/paperclip/issues/671
17
+ ar_model = @attachment.instance
18
+ ar_model[@options[:full_text_column]] = full_text
19
+ ar_model.run_callbacks(:save) { false }
20
+
21
+ # This would be the preferred method of saving this text.
22
+ # @attachment.instance.update_attribute(@options[:full_text_column], full_text)
23
+ end
24
+
25
+ destination_file
26
+ end
27
+
28
+ def destination_file
29
+ File.open(File.join(@dst_path, "#{@basename}.txt"))
30
+ end
31
+
32
+ def full_text
33
+ full_text = String.new
34
+
35
+ destination_file.each do |line|
36
+ full_text += line
37
+ end
38
+
39
+ full_text
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,65 @@
1
+ require 'spec_helper'
2
+
3
+ describe Paperclip::DocsplitText do
4
+ before(:all) do
5
+ @file = File.open("./fixtures/word_xml.docx")
6
+ end
7
+
8
+ after(:all) do
9
+ @file.close
10
+ end
11
+
12
+ context "with no options supplied" do
13
+ before(:all) do
14
+ @options = {}
15
+ @processor = Paperclip::DocsplitText.new(@file, @options)
16
+ end
17
+
18
+ it "#make sends the correct commands to Docsplit" do
19
+ Docsplit.should_receive(:extract_text).with(File.expand_path(@file.path), @options.merge(:output => Dir.tmpdir))
20
+
21
+ @processor.make
22
+ end
23
+
24
+ it "#make returns the text tempfile created by Docsplit" do
25
+ result = @processor.make
26
+
27
+ text = String.new
28
+ result.each do |line|
29
+ text += line
30
+ end
31
+
32
+ text.should eq("This is a test document.\n\n\f")
33
+ end
34
+ end
35
+
36
+ context "with a destination column for extracted text" do
37
+ before(:all) do
38
+ @options = {:full_text_column => :document_full_text}
39
+ @doc = Document.new()
40
+ end
41
+
42
+ after(:all) do
43
+ FileUtils.rm_rf("./spec/tmp", secure: true)
44
+ end
45
+
46
+ it "#make stores the full text in the specified field" do
47
+ @doc.original = @file
48
+ @doc.save!
49
+
50
+ @doc.reload
51
+
52
+ @doc.original_full_text.should eq("This is a test document.\n\n\f")
53
+ end
54
+ end
55
+
56
+ context "when processing fails" do
57
+ it "#make raises an error if the processing was unsuccessful" do
58
+ Dir.stub!(:tmpdir).and_return(:raise)
59
+
60
+ lambda {
61
+ Paperclip::DocsplitText.new(@file, {}).make
62
+ }.should raise_error(Paperclip::Error)
63
+ end
64
+ end
65
+ end
@@ -1,55 +1,47 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe Paperclip::DocsplitImage do
4
- def pdf_jpg_images
5
- Dir.entries(Dir.tmpdir).reject{ |x| !(x =~ /twopage_\d.jpg/) }
4
+ def delete_temp_images
5
+ Dir.entries(Dir.tmpdir).reject{ |x| !(x =~ /twopage_\d.jpg/) }.each do |tempfile|
6
+ File.delete(File.join(Dir.tmpdir, tempfile))
7
+ end
8
+ end
9
+
10
+ before(:all) do
11
+ delete_temp_images
12
+ @file = File.open("./fixtures/twopage.pdf")
13
+ end
14
+
15
+ after(:all) do
16
+ delete_temp_images
17
+ @file.close
6
18
  end
7
19
 
8
20
  context "with a valid pdf file attachment" do
9
21
  before(:all) do
10
- pdf_jpg_images.each do |tempfile|
11
- File.delete(File.join(Dir.tmpdir, tempfile))
12
- end
13
-
14
- @file = File.open("./fixtures/twopage.pdf")
15
- @processor = Paperclip::DocsplitImage.new(@file, {:format => :jpg, :size => "50x50"})
16
- @output = @processor.make
22
+ @options = {:format => :jpg, :size => "50x50"}
23
+ @processor = Paperclip::DocsplitImage.new(@file, @options)
17
24
  end
18
25
 
19
- after(:all) do
20
- @file.close
26
+ it "#make sends the correct commands to Docsplit" do
27
+ @processor.stub!(:destination_file)
28
+ Docsplit.should_receive(:extract_images).with(File.expand_path(@file.path), @options.merge(:output => Dir.tmpdir))
29
+
30
+ @processor.make
21
31
  end
22
32
 
23
- it "#make generates an image for each page of the document" do
24
- pdf_jpg_images.count.should eq(2)
25
- end
26
-
27
- it "#make generates images at the specified resolution" do
28
- cmd = %Q[identify -format "%wx%h" "#{@output.path}"]
29
- `#{cmd}`.chomp.should eq("39x50")
30
- end
31
-
32
- it "#make generates images in the specified format" do
33
- pdf_jpg_images.each do |output_file|
34
- FileMagic.new.file(File.join(Dir.tmpdir, output_file)).should =~ /jpeg/i
35
- end
36
- end
37
-
38
33
  it "#make returns the image of the first page" do
39
- File.basename(@output).should eq('twopage_1.jpg')
34
+ @processor.make.path.should eq(File.open(Dir.tmpdir + '/twopage_1.jpg').path)
40
35
  end
41
36
  end
42
37
 
43
38
  context "when processing fails" do
44
39
  it "#make raises an error if the processing was unsuccessful" do
45
- @file = File.open("./fixtures/twopage.pdf")
46
40
  Dir.stub!(:tmpdir).and_return(:raise)
47
41
 
48
42
  lambda {
49
43
  Paperclip::DocsplitImage.new(@file, {:format => :jpg}).make
50
44
  }.should raise_error(Paperclip::Error)
51
-
52
- @file.close
53
45
  end
54
46
  end
55
47
  end
@@ -0,0 +1,10 @@
1
+ ActiveRecord::Schema.define :version => 0 do
2
+ create_table "documents", :force => true do |t|
3
+ t.string :owner
4
+ t.string :original_file_name
5
+ t.string :original_content_type
6
+ t.integer :original_updated_at
7
+ t.integer :original_file_size
8
+ t.text :original_full_text
9
+ end
10
+ end
@@ -1,7 +1,33 @@
1
- require "paperclip"
2
- require "filemagic"
3
- require "docsplit"
4
- require "docsplit-paperclip-processor"
1
+ require 'paperclip'
2
+ require 'paperclip/railtie'
3
+ require 'filemagic'
4
+ require 'docsplit'
5
+ require 'docsplit-paperclip-processor'
5
6
 
6
7
  require 'rspec'
7
- require 'rspec/autorun'
8
+ require 'rspec/autorun'
9
+
10
+ # Prepare activerecord
11
+ require "active_record"
12
+
13
+ # Connect to sqlite
14
+ ActiveRecord::Base.establish_connection(
15
+ "adapter" => "sqlite3",
16
+ "database" => ":memory:"
17
+ )
18
+
19
+ ActiveRecord::Base.logger = Logger.new(nil)
20
+ load(File.join(File.dirname(__FILE__), 'schema.rb'))
21
+
22
+ Paperclip::Railtie.insert
23
+
24
+ class Document < ActiveRecord::Base
25
+ has_attached_file :original,
26
+ :storage => :filesystem,
27
+ :path => "./spec/tmp/:id.:extension",
28
+ :url => "/spec/tmp/:id.:extension",
29
+ :styles => {
30
+ :text => {:full_text_column => :original_full_text}
31
+ },
32
+ :processors => [:docsplit_text]
33
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit-paperclip-processor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-14 00:00:00.000000000 Z
12
+ date: 2012-08-18 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: paperclip
@@ -18,7 +18,7 @@ dependencies:
18
18
  requirements:
19
19
  - - ~>
20
20
  - !ruby/object:Gem::Version
21
- version: '2.4'
21
+ version: '3.1'
22
22
  type: :runtime
23
23
  prerelease: false
24
24
  version_requirements: !ruby/object:Gem::Requirement
@@ -26,7 +26,7 @@ dependencies:
26
26
  requirements:
27
27
  - - ~>
28
28
  - !ruby/object:Gem::Version
29
- version: '2.4'
29
+ version: '3.1'
30
30
  - !ruby/object:Gem::Dependency
31
31
  name: ruby-filemagic
32
32
  requirement: !ruby/object:Gem::Requirement
@@ -59,6 +59,22 @@ dependencies:
59
59
  - - ! '>='
60
60
  - !ruby/object:Gem::Version
61
61
  version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: bundler
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: 1.1.4
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: 1.1.4
62
78
  - !ruby/object:Gem::Dependency
63
79
  name: rspec
64
80
  requirement: !ruby/object:Gem::Requirement
@@ -75,6 +91,54 @@ dependencies:
75
91
  - - ! '>='
76
92
  - !ruby/object:Gem::Version
77
93
  version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: activerecord
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :development
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ - !ruby/object:Gem::Dependency
111
+ name: sqlite3
112
+ requirement: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ! '>='
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
126
+ - !ruby/object:Gem::Dependency
127
+ name: paperclip
128
+ requirement: !ruby/object:Gem::Requirement
129
+ none: false
130
+ requirements:
131
+ - - ! '>='
132
+ - !ruby/object:Gem::Version
133
+ version: '0'
134
+ type: :runtime
135
+ prerelease: false
136
+ version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
138
+ requirements:
139
+ - - ! '>='
140
+ - !ruby/object:Gem::Version
141
+ version: '0'
78
142
  description: This gem is a simple Paperclip processor which uses Docsplit to convert
79
143
  uploaded files to pdf, or extract information/thumbnails from them
80
144
  email:
@@ -84,6 +148,7 @@ extensions: []
84
148
  extra_rdoc_files: []
85
149
  files:
86
150
  - .gitignore
151
+ - .rspec
87
152
  - CHANGELOG
88
153
  - Gemfile
89
154
  - README.md
@@ -96,9 +161,14 @@ files:
96
161
  - fixtures/word_xml.docx
97
162
  - lib/docsplit-paperclip-processor.rb
98
163
  - lib/docsplit-paperclip-processor/version.rb
164
+ - lib/processors/docsplit_image.rb
165
+ - lib/processors/docsplit_pdf.rb
166
+ - lib/processors/docsplit_text.rb
99
167
  - spec/docsplit-paperclip-processor_spec.rb
168
+ - spec/docsplit_extract_text_spec.rb
100
169
  - spec/docsplit_image_spec.rb
101
170
  - spec/docsplit_pdf_spec.rb
171
+ - spec/schema.rb
102
172
  - spec/spec_helper.rb
103
173
  homepage: https://github.com/tienle/docsplit-paperclip-processor
104
174
  licenses: []
@@ -126,7 +196,8 @@ specification_version: 3
126
196
  summary: A Paperclip processor for Docsplit
127
197
  test_files:
128
198
  - spec/docsplit-paperclip-processor_spec.rb
199
+ - spec/docsplit_extract_text_spec.rb
129
200
  - spec/docsplit_image_spec.rb
130
201
  - spec/docsplit_pdf_spec.rb
202
+ - spec/schema.rb
131
203
  - spec/spec_helper.rb
132
- has_rdoc: