paperclip-document 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *~
2
+ *.gem
3
+ *.rbc
4
+ .bundle
5
+ .config
6
+ .yardoc
7
+ Gemfile.lock
8
+ InstalledFiles
9
+ _yardoc
10
+ coverage
11
+ doc/
12
+ lib/bundler/man
13
+ pkg
14
+ rdoc
15
+ spec/reports
16
+ test/tmp
17
+ test/version_tmp
18
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Gems are resolved from the public RubyGems index.
source "https://rubygems.org"

# All dependencies (runtime and development) are declared in
# paperclip-document.gemspec; Bundler reads them from there.
gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Brice Texier
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # Paperclip::Document
2
+
3
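+ Paperclip processors for documents, built on Docsplit: count pages, extract text, convert to PDF and render a first-page thumbnail.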
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'paperclip-document'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install paperclip-document
18
+
19
+ ## Usage
20
+
21
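+ List the processors you need (`:reader`, `:counter`, `:freezer`, `:sketcher`) in the `:styles` of your `has_attached_file` declaration. The extracted text and page count are written to the `<attachment>_content_text` and `<attachment>_pages_count` columns and can be read with `attachment.content_text` and `attachment.pages_count`.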
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ require 'rake/testtask'
4
# `rake test` runs every test/**/test_*.rb file with lib/ and test/ added
# to the load path.
Rake::TestTask.new(:test) do |t|
  t.libs.push('lib', 'test')
  t.pattern = 'test/**/test_*.rb'
  t.verbose = true
end

# A bare `rake` runs the test suite.
task :default => :test
@@ -0,0 +1,44 @@
1
require "paperclip/document/version"
require "docsplit"
require "pathname"
require "tmpdir"
4
+
5
module Paperclip
  # Base class shared by the document processors (Counter, Freezer, Reader
  # and Sketcher). It exposes the model instance owning the attachment and a
  # per-run scratch directory path that subclasses hand to Docsplit as the
  # :output directory.
  class DocumentProcessor < Processor

    # instance:: the model object owning the attachment
    # tmp_dir::  Pathname of the scratch directory for this run (only the
    #            path is built here, the directory itself is not created)
    attr_reader :instance, :tmp_dir

    # file::       the file being processed
    # options::    the style options given to the processor
    # attachment:: the attachment being processed
    def initialize(file, options = {}, attachment = nil)
      super(file, options, attachment)
      @instance = @attachment.instance
      # Timestamp plus random suffix keeps concurrent runs on the same file
      # from sharing a directory.
      run_id = "#{Time.now.to_i.to_s(36)}-#{rand(1_000_000).to_s(36)}"
      @tmp_dir = Pathname.new(Dir.tmpdir).join(
        "paperclip-document",
        instance.class.name,
        attachment.name.to_s,
        basename,
        run_id
      )
    end

    # Returns the path of the processed file as a Pathname.
    def file_path
      Pathname.new(@file.path)
    end

    # Returns the file name without its directory and without its last
    # extension ("my.report.pdf" => "my.report").
    #
    # Only the final extension is removed because Docsplit names its output
    # files after File.basename(path, File.extname(path)); stripping every
    # dotted segment would make the processors look for a file that Docsplit
    # never wrote.
    def basename
      path = file_path
      path.basename(path.extname).to_s
    end

  end

  # Additional readers on attachments for the values stored by the Reader
  # and Counter processors.
  class Attachment

    # Returns the text of the file as originally extracted. It lives in the
    # <attachment>_content_text attribute of the model.
    def content_text
      instance_read(:content_text)
    end

    # Returns the pages count of the file as originally computed. It lives
    # in the <attachment>_pages_count attribute of the model.
    def pages_count
      instance_read(:pages_count)
    end

  end

end
43
+
44
+ require "paperclip/document/processors"
@@ -0,0 +1,4 @@
1
+ require "paperclip/document/processors/sketcher"
2
+ require "paperclip/document/processors/reader"
3
+ require "paperclip/document/processors/freezer"
4
+ require "paperclip/document/processors/counter"
@@ -0,0 +1,45 @@
1
module Paperclip

  # This processor counts the pages of the file and stores the result in a
  # column of the model. The file itself is passed through untouched.
  class Counter < DocumentProcessor

    # Name of the model column receiving the pages count.
    attr_accessor :pages_count_column

    # Options:
    # :pages_count_column:: column receiving the count. Defaults to
    #                       <attachment>_pages_count when the model has
    #                       such a column.
    #
    # Raises Paperclip::Error when no column is given and the default one
    # does not exist.
    def initialize(file, options = {}, attachment = nil)
      super(file, options, attachment)
      if @options[:pages_count_column].nil? && pages_count_column?
        @options[:pages_count_column] = default_pages_count_column
      end
      @pages_count_column = @options[:pages_count_column]

      unless @pages_count_column
        raise Paperclip::Error, "No pages count column given"
      end
    end

    # Computes the pages count of the whole document, writes it on the model
    # and returns the original file.
    def make
      instance[pages_count_column] = Docsplit.extract_length(file_path.to_s)
      # NOTE(review): runs the model's :save callbacks around a block that
      # only returns false; the record is not persisted here. Confirm the
      # intent before changing it.
      instance.run_callbacks(:save) { false }

      file
    end

    # Tells whether the model has the default pages count column.
    def pages_count_column?
      expected_column = default_pages_count_column
      instance.class.columns.any? do |column|
        column.name.to_s == expected_column
      end
    end

    # Returns the name of the default pages count column.
    def default_pages_count_column
      "#{@attachment.name}_pages_count"
    end

  end

end
@@ -0,0 +1,38 @@
1
+ require 'filemagic'
2
+
3
module Paperclip

  # This processor converts the document to PDF. A file that is already a
  # PDF is returned as is.
  class Freezer < DocumentProcessor

    # Options:
    # :format:: must be :pdf (a "pdf" string is accepted too, like the
    #           format option of Sketcher).
    #
    # Raises Paperclip::Error for any other format.
    def initialize(file, options = {}, attachment = nil)
      super
      requested = options[:format]
      @format = requested.respond_to?(:to_sym) ? requested.to_sym : requested
      unless @format == :pdf
        raise Paperclip::Error, "Valid format (pdf) must be specified"
      end
    end

    # Converts the document to PDF and returns the resulting file, opened
    # for reading.
    def make
      # Nothing to convert: hand back the source file.
      return File.open(file_path.to_s) if pdf_format?

      destination_path = tmp_dir.to_s
      Docsplit.extract_pdf(file_path.to_s, :output => destination_path)
      File.open(File.join(destination_path, "#{basename}.#{@format}"))
    end

    # Tells whether libmagic describes the file as a PDF.
    def pdf_format?
      file_magic = FileMagic.new
      begin
        type = file_magic.file(file_path.to_s)
      ensure
        # Release the libmagic handle even when detection fails.
        file_magic.close
      end
      !(type =~ /pdf/i).nil?
    end

  end

end
@@ -0,0 +1,53 @@
1
module Paperclip

  # This processor extracts the text of the file with Docsplit and stores it
  # in a column of the model. The file itself is passed through untouched.
  class Reader < DocumentProcessor

    # clean::       whether the :clean option is passed as true to Docsplit
    # text_column:: name of the model column receiving the text
    # language::    language given to Docsplit, or a Proc receiving the
    #               model instance and returning it
    attr_accessor :clean, :text_column, :language

    # Options:
    # :text_column:: column receiving the text. Defaults to
    #                <attachment>_content_text when the model has such a
    #                column.
    # :language::    see the +language+ attribute
    # :clean::       defaults to true
    #
    # Raises Paperclip::Error when no column is given and the default one
    # does not exist.
    def initialize(file, options = {}, attachment = nil)
      super(file, options, attachment)
      if @options[:text_column].nil? && text_column?
        @options[:text_column] = default_text_column
      end
      @language = @options[:language]
      @text_column = @options[:text_column]
      unless @text_column
        raise Paperclip::Error, "No content text column given"
      end
      @clean = !!options.fetch(:clean, true)
    end

    # Extracts the text of the whole document, writes it on the model and
    # returns the original file.
    def make
      destination_path = tmp_dir.to_s
      extraction = {:output => destination_path, :clean => @clean}
      extraction[:language] = language.is_a?(Proc) ? language.call(instance) : language
      Docsplit.extract_text(file_path.to_s, extraction)

      # File.read closes the descriptor itself, so nothing leaks when the
      # read or the callbacks below raise.
      instance[text_column] = File.read(File.join(destination_path, "#{basename}.txt"))
      # NOTE(review): runs the model's :save callbacks around a block that
      # only returns false; the record is not persisted here. Confirm the
      # intent before changing it.
      instance.run_callbacks(:save) { false }

      file
    end

    # Tells whether the model has the default text column.
    def text_column?
      expected_column = default_text_column
      instance.class.columns.any? do |column|
        column.name.to_s == expected_column
      end
    end

    # Returns the name of the default text column.
    def default_text_column
      "#{@attachment.name}_content_text"
    end

  end

end
@@ -0,0 +1,39 @@
1
module Paperclip

  # This processor extracts the first page of the document as an image
  # (thumbnail).
  class Sketcher < DocumentProcessor

    # format::  image format, :jpg or :png
    # density:: rendering density, only set when no :size option is given
    attr_accessor :format, :density

    # Options:
    # :format::  :jpg (default) or :png
    # :size::    passed to Docsplit as is; takes precedence over :density
    # :density:: used when no :size is given, defaults to 150
    #
    # Raises Paperclip::Error for any other format.
    def initialize(file, options = {}, attachment = nil)
      super(file, options, attachment)
      @format = (options[:format] || :jpg).to_sym
      unless [:jpg, :png].include?(@format)
        raise Paperclip::Error, "Valid format must be specified"
      end
      @size = options[:size]
      @density = (options[:density] || 150).to_f unless @size
    end

    # Renders the first page and returns the image file, opened for reading.
    #
    # Raises Paperclip::Error when Docsplit fails to extract the image.
    def make
      destination_path = tmp_dir.to_s
      extraction = {:output => destination_path, :pages => [1], :format => [@format]}
      if @size
        extraction[:size] = @size
      elsif @density
        extraction[:density] = @density
      end

      # The extraction has to run inside the begin block, otherwise the
      # rescue below can never catch its failures.
      begin
        Docsplit.extract_images(file_path.to_s, extraction)
      rescue StandardError
        raise Paperclip::Error, "There was an error extracting the first thumbnail from #{basename}"
      end

      File.open(File.join(destination_path, "#{basename}_1.#{@format}"))
    end

  end

end
@@ -0,0 +1,5 @@
1
module Paperclip
  module Document
    # Version of the paperclip-document gem; the gemspec reads it from here.
    # Left unfrozen on purpose: RubyGems 1.8.x calls strip! on the version
    # string it is given.
    VERSION = "0.0.0"
  end
end
@@ -0,0 +1,27 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'paperclip/document/version'
5
+
6
# Gem specification. The version comes from lib/paperclip/document/version.rb
# and the file list from git, so the gem must be built from a git checkout.
Gem::Specification.new do |spec|
  spec.name          = "paperclip-document"
  spec.version       = Paperclip::Document::VERSION
  spec.authors       = ["Brice Texier"]
  spec.email         = ["burisu@oneiros.fr"]
  spec.summary       = %q{Processors for paperclip}
  # A description was missing, which leaves the field empty in the built gem.
  spec.description   = %q{Paperclip processors built on Docsplit: page counting, text extraction, PDF conversion and first-page thumbnails for document attachments.}
  spec.homepage      = "http://github.com/burisu/paperclip-document"
  spec.license       = "MIT"

  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependencies
  spec.add_dependency "paperclip", "~> 3.1"
  spec.add_dependency "docsplit", "~> 0.7.2"
  spec.add_dependency "ruby-filemagic", "~> 0.4.2"

  # Development and test dependencies
  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  spec.add_development_dependency 'activerecord', "~> 3.2"
  spec.add_development_dependency 'sqlite3'
end
Binary file
Binary file
Binary file
data/test/helper.rb ADDED
@@ -0,0 +1,37 @@
1
+ require 'paperclip'
2
+ require 'paperclip/railtie'
3
+ require 'active_record'
4
+ require 'pathname'
5
+ require 'paperclip/document'
6
+ require 'test/unit'
7
+
8
# Run the tests against a throw-away in-memory SQLite database.
connection_settings = {
  "adapter"  => "sqlite3",
  "database" => ":memory:"
}
ActiveRecord::Base.establish_connection(connection_settings)

# A logger without a device discards everything ActiveRecord logs.
ActiveRecord::Base.logger = Logger.new(nil)

# Create the tables used by the tests.
schema_file = File.join(File.dirname(__FILE__), 'schema.rb')
load(schema_file)

# NOTE(review): presumably hooks Paperclip into ActiveRecord, since no Rails
# application is booted here - confirm against the Paperclip version in use.
Paperclip::Railtie.insert
18
+
19
# Model used by the tests: a single attachment whose two styles exercise the
# four processors of the gem.
class Document < ActiveRecord::Base
  archive_style   = {:clean => true, :format => :pdf, :processors => [:reader, :counter, :freezer]}
  thumbnail_style = {:processors => [:sketcher], :format => :jpg}

  has_attached_file :original,
                    :storage => :filesystem,
                    :path    => "./tmp/documents/:id/:style.:extension",
                    :url     => "/tmp/:id.:extension",
                    :styles  => {:archive => archive_style, :thumbnail => thumbnail_style}
end
29
+
30
+
31
+
32
# Base class of the gem's test cases.
class Paperclip::Document::TestCase < Test::Unit::TestCase

  # Returns the Pathname of the "fixtures" directory located next to this
  # helper file.
  def fixtures
    helper_dir = Pathname.new(__FILE__).dirname
    helper_dir.join("fixtures")
  end
end
data/test/schema.rb ADDED
@@ -0,0 +1,11 @@
1
# Schema of the test database: a single "documents" table holding the
# Paperclip columns of the :original attachment plus the two columns filled
# by the Reader and Counter processors.
ActiveRecord::Schema.define :version => "001" do
  create_table "documents", :force => true do |t|
    t.string   :name
    # Paperclip attachment columns
    t.string   :original_file_name
    t.string   :original_content_type
    # Paperclip writes a Time here, so the column is a datetime rather than
    # an integer.
    t.datetime :original_updated_at
    t.integer  :original_file_size
    # Filled by the Reader processor
    t.text     :original_content_text
    # Filled by the Counter processor
    t.integer  :original_pages_count
  end
end
@@ -0,0 +1,42 @@
1
+ # encoding: utf-8
2
+ require 'helper'
3
+
4
# End-to-end tests: each one attaches a one-page fixture to a Document and
# checks the values stored by the Counter and Reader processors.
class TestProcessors < Paperclip::Document::TestCase

  def test_odt
    assert_fixture_processed("example.odt", "My first document")
  end

  def test_pdf
    assert_fixture_processed("example.pdf", "My second document")
  end

  def test_docx
    assert_fixture_processed("example.docx", "My third document")
  end

  private

  # Creates a Document from the given fixture and checks its pages count and
  # extracted text.
  def assert_fixture_processed(fixture_name, document_name)
    # Block form closes the fixture even when create! raises.
    document = File.open(fixtures.join(fixture_name)) do |f|
      Document.create!(:name => document_name, :original => f)
    end

    document.reload
    assert_equal 1, document.original.pages_count

    assert !document.original.content_text.nil?
    assert document.original.content_text.match("This is an example.")
  end

end
metadata ADDED
@@ -0,0 +1,184 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: paperclip-document
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Brice Texier
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-04-16 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: paperclip
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '3.1'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: '3.1'
30
+ - !ruby/object:Gem::Dependency
31
+ name: docsplit
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ~>
36
+ - !ruby/object:Gem::Version
37
+ version: 0.7.2
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: 0.7.2
46
+ - !ruby/object:Gem::Dependency
47
+ name: ruby-filemagic
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ~>
52
+ - !ruby/object:Gem::Version
53
+ version: 0.4.2
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 0.4.2
62
+ - !ruby/object:Gem::Dependency
63
+ name: bundler
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ~>
68
+ - !ruby/object:Gem::Version
69
+ version: '1.3'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ~>
76
+ - !ruby/object:Gem::Version
77
+ version: '1.3'
78
+ - !ruby/object:Gem::Dependency
79
+ name: rake
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: activerecord
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ~>
100
+ - !ruby/object:Gem::Version
101
+ version: '3.2'
102
+ type: :development
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ~>
108
+ - !ruby/object:Gem::Version
109
+ version: '3.2'
110
+ - !ruby/object:Gem::Dependency
111
+ name: sqlite3
112
+ requirement: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ! '>='
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
126
+ description:
127
+ email:
128
+ - burisu@oneiros.fr
129
+ executables: []
130
+ extensions: []
131
+ extra_rdoc_files: []
132
+ files:
133
+ - .gitignore
134
+ - Gemfile
135
+ - LICENSE.txt
136
+ - README.md
137
+ - Rakefile
138
+ - lib/paperclip/document.rb
139
+ - lib/paperclip/document/processors.rb
140
+ - lib/paperclip/document/processors/counter.rb
141
+ - lib/paperclip/document/processors/freezer.rb
142
+ - lib/paperclip/document/processors/reader.rb
143
+ - lib/paperclip/document/processors/sketcher.rb
144
+ - lib/paperclip/document/version.rb
145
+ - paperclip-document.gemspec
146
+ - test/fixtures/example.docx
147
+ - test/fixtures/example.odt
148
+ - test/fixtures/example.pdf
149
+ - test/helper.rb
150
+ - test/schema.rb
151
+ - test/test_processors.rb
152
+ homepage: http://github.com/burisu/paperclip-document
153
+ licenses:
154
+ - MIT
155
+ post_install_message:
156
+ rdoc_options: []
157
+ require_paths:
158
+ - lib
159
+ required_ruby_version: !ruby/object:Gem::Requirement
160
+ none: false
161
+ requirements:
162
+ - - ! '>='
163
+ - !ruby/object:Gem::Version
164
+ version: '0'
165
+ required_rubygems_version: !ruby/object:Gem::Requirement
166
+ none: false
167
+ requirements:
168
+ - - ! '>='
169
+ - !ruby/object:Gem::Version
170
+ version: '0'
171
+ requirements: []
172
+ rubyforge_project:
173
+ rubygems_version: 1.8.23
174
+ signing_key:
175
+ specification_version: 3
176
+ summary: Processors for paperclip
177
+ test_files:
178
+ - test/fixtures/example.docx
179
+ - test/fixtures/example.odt
180
+ - test/fixtures/example.pdf
181
+ - test/helper.rb
182
+ - test/schema.rb
183
+ - test/test_processors.rb
184
+ has_rdoc: