proselytism 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. data/.gitignore +18 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +94 -0
  5. data/Rakefile +9 -0
  6. data/lib/generators/proselytism/config_generator.rb +14 -0
  7. data/lib/generators/proselytism/initializer_generator.rb +14 -0
  8. data/lib/generators/proselytism/templates/config.yml +29 -0
  9. data/lib/generators/proselytism/templates/initializer.rb +30 -0
  10. data/lib/proselytism.rb +14 -0
  11. data/lib/proselytism/converter.rb +74 -0
  12. data/lib/proselytism/converters/open_office.rb +183 -0
  13. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/ChangeLog.txt +119 -0
  14. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/LICENSE.txt +504 -0
  15. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/README.txt +58 -0
  16. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/docs/jodconverter-2.2.2-javadoc.jar +0 -0
  17. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/docs/third-party-licenses/license-commons-io.txt +203 -0
  18. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/docs/third-party-licenses/license-openoffice.org.txt +8 -0
  19. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/docs/third-party-licenses/license-slf4j.txt +24 -0
  20. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/docs/third-party-licenses/license-xstream.txt +27 -0
  21. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/document-formats.xml +513 -0
  22. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/DEPENDENCIES.txt +17 -0
  23. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/commons-cli-1.2.jar +0 -0
  24. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/commons-io-1.4.jar +0 -0
  25. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/jodconverter-2.2.2.jar +0 -0
  26. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/jodconverter-cli-2.2.2.jar +0 -0
  27. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/juh-3.0.1.jar +0 -0
  28. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/jurt-3.0.1.jar +0 -0
  29. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/ridl-3.0.1.jar +0 -0
  30. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/slf4j-api-1.5.6.jar +0 -0
  31. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/slf4j-jdk14-1.5.6.jar +0 -0
  32. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/unoil-3.0.1.jar +0 -0
  33. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/xstream-1.3.1.jar +0 -0
  34. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/src/jodconverter-2.2.2-sources.jar +0 -0
  35. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/src/jodconverter-cli-2.2.2-sources.jar +0 -0
  36. data/lib/proselytism/converters/open_office/odconverters/pyodconverter.py +151 -0
  37. data/lib/proselytism/converters/pdf_images.rb +13 -0
  38. data/lib/proselytism/converters/pdf_to_text.rb +14 -0
  39. data/lib/proselytism/converters/ppm_to_jpeg.rb +14 -0
  40. data/lib/proselytism/engine.rb +25 -0
  41. data/lib/proselytism/proselytism.rb +90 -0
  42. data/lib/proselytism/shared.rb +22 -0
  43. data/lib/proselytism/version.rb +3 -0
  44. data/proselytism.gemspec +25 -0
  45. data/spec/.DS_Store +0 -0
  46. data/spec/base_converter_spec.rb +15 -0
  47. data/spec/fixtures/001.doc +0 -0
  48. data/spec/fixtures/001.pdf +0 -0
  49. data/spec/fixtures/001.txt +63 -0
  50. data/spec/fixtures/002.doc +0 -0
  51. data/spec/fixtures/003.docx +0 -0
  52. data/spec/fixtures/004-fake.pdf +0 -0
  53. data/spec/fixtures/005-fake.doc +0 -0
  54. data/spec/open_office_spec.rb +83 -0
  55. data/spec/pdf_images_spec.rb +19 -0
  56. data/spec/pdf_to_text_spec.rb +19 -0
  57. data/spec/proselytism_spec.rb +106 -0
  58. data/spec/shared_spec.rb +26 -0
  59. data/spec/spec_helper.rb +48 -0
  60. metadata +195 -0
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ spec/tmp
16
+ test/version_tmp
17
+ .idea
18
+ spec/tmp/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in proselytism.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Itkin
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,94 @@
1
+ # Proselytism
2
+
3
+ Document converter, text and image extractor using OpenOffice headless server, pdf_tools and net_pbm
4
+
5
+ ## Note
6
+
7
+ This gem was originally written as a RoR 3.2 engine running on Ruby 1.8.7.
8
+ It should be framework agnostic and has been tested on Ubuntu and MacOSX.
9
+
10
+ Due to its dependency on system_timer, it doesn't work with Ruby 1.9.x
11
+
12
+ ## Installation
13
+
14
+ Install the required external libraries:
15
+
16
+ # aptitude install netpbm
17
+ # aptitude install xpdf
18
+ # aptitude install libreoffice
19
+
20
+ Add this line to your application's Gemfile:
21
+
22
+ gem 'proselytism', :git => "git://github.com/itkin/proselytism.git"
23
+
24
+ And then execute:
25
+
26
+ $ bundle
27
+
28
+ Generate the config file and/or an initializer:
29
+
30
+ $ rails g proselytism:config
31
+ $ rails g proselytism:initializer
32
+
33
+ As an engine, Proselytism automatically loads and configures itself from /config/proselytism.yml if it exists.
34
+ You can override these configuration params with an initializer. This is especially useful when you want a custom log file:
35
+
36
+ ```ruby
37
+ #/config/initializers/proselytism.rb
38
+ Proselytism.config do |config|
39
+ config.logger = ActiveSupport::BufferedLogger.new(File.join(Rails.root, 'log', 'proselytism.log'))
40
+ end
41
+ ```
42
+
43
+ ## Usage
44
+
45
+ ```ruby
46
+ Proselytism.convert source_file_path, :to => :pdf do |converted_file_path|
47
+
48
+ end
49
+ Proselytism.extract_text source_file_path do |extracted_text|
50
+
51
+ end
52
+ Proselytism.extract_images source_file_path do |image_files_paths|
53
+
54
+ end
55
+ ```
56
+
57
+ Proselytism creates its converted files in temporary folders.
58
+ - If you pass a block to the method, the folders are automatically deleted after the block has been yielded, so use or copy the file content within the block
59
+ - If you don't pass a block, don't forget to safely remove the temp folder
60
+
61
+ ```ruby
62
+ pdf_file_path = Proselytism.convert source_file_path, :to => :pdf
63
+ FileUtils.remove_entry_secure File.dirname(pdf_file_path)
64
+ ```
65
+
66
+ ## Add your own converter
67
+
68
+ Add your own converter by extending Proselytism::Converters::Base
69
+ - Your converter will be automatically selected and used according to its `from` and `to` extension lists
70
+ - Add a perform method which:
71
+ - defines a text command
72
+ - calls execute
73
+ - returns the converted file(s) path
74
+
75
+ ```ruby
76
+ class MyConverter < Proselytism::Converters::Base
77
+ from :ext1, :ext2
78
+ to :ext3, :ext4
79
+
80
+ def perform(origin, options={})
81
+ destination = destination_file_path(origin, options)
82
+ command = "pdftotext #{origin} #{destination} 2>&1"
83
+ execute command
84
+ destination
85
+ end
86
+ end
87
+ ```
88
+ ## Contributing
89
+
90
+ 1. Fork it
91
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
92
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
93
+ 4. Push to the branch (`git push origin my-new-feature`)
94
+ 5. Create new Pull Request
@@ -0,0 +1,9 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ require "rspec/core/rake_task"
4
+
5
+ RSpec::Core::RakeTask.new
6
+
7
+ task :default => :spec
8
+ task :test => :spec
9
+
@@ -0,0 +1,14 @@
1
+ require 'rails/generators'
2
+
3
+ module Proselytism
4
+ class ConfigGenerator < Rails::Generators::Base
5
+
6
+ self.source_paths << File.join(File.dirname(__FILE__), 'templates')
7
+
8
+ desc "Generate Proselytism config file load before rails initialize"
9
+ def create_config_file
10
+ template 'config.yml', 'config/proselytism.yml'
11
+ end
12
+
13
+ end
14
+ end
@@ -0,0 +1,14 @@
1
+ require 'rails/generators'
2
+
3
+ module Proselytism
4
+ class InitializerGenerator < Rails::Generators::Base
5
+
6
+ self.source_paths << File.join(File.dirname(__FILE__), 'templates')
7
+
8
+
9
+ desc "Generate an initializer file to override the configuration file"
10
+ def create_initializer_file
11
+ template 'initializer.rb', 'config/initializers/proselytism.rb'
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,29 @@
1
+ test:
2
+ open_office_path: "/Applications/OpenOffice.org.app/Contents/MacOS/soffice"
3
+ oo_server_bridge: "JOD"
4
+ oo_server_max_cpu: 95 #percent
5
+ oo_server_max_cpu_delay: 2 #seconds
6
+ oo_server_availability_delay: 6 # seconds
7
+ oo_server_start_delay: 2 #seconds
8
+ oo_conversion_max_tries: 2
9
+ oo_conversion_max_time: 4 #seconds
10
+
11
+ development:
12
+ open_office_path: "/Applications/OpenOffice.org.app/Contents/MacOS/soffice"
13
+ oo_server_bridge: "JOD"
14
+ oo_server_max_cpu: 95 #percent
15
+ oo_server_max_cpu_delay: 2 #seconds
16
+ oo_server_availability_delay: 6 # seconds
17
+ oo_server_start_delay: 2 #seconds
18
+ oo_conversion_max_tries: 2
19
+ oo_conversion_max_time: 4 #seconds
20
+
21
+ production:
22
+ open_office_path: "/usr/bin/soffice"
23
+ oo_server_bridge: "PYOD"
24
+ oo_server_max_cpu: 98 #percent
25
+ oo_server_max_cpu_delay: 2 #seconds
26
+ oo_server_availability_delay: 7 #seconds
27
+ oo_server_start_delay: 2 #seconds
28
+ oo_conversion_max_tries: 2
29
+ oo_conversion_max_time: 5 #seconds
@@ -0,0 +1,30 @@
1
+ Proselytism.config do |config|
2
+ #Open Office binary path /usr/bin/soffice for unix or /Applications/OpenOffice.org.app/Contents/MacOS/soffice for mac OSX
3
+ config.open_office_path = "/Applications/OpenOffice.org.app/Contents/MacOS/soffice"
4
+
5
+ #Bridge PYOD or JOD (PYOD doesn't work as is on mac)
6
+ config.oo_server_bridge = "JOD"
7
+
8
+ #When ensuring server availability (before converting a doc),
9
+ #Restart the server if all processes are above max cpu during max_cpu_delay
10
+ config.oo_server_max_cpu = 95 #percent
11
+ config.oo_server_max_cpu_delay = 2 #seconds
12
+
13
+ #Time the server waits for availability before converting a doc
14
+ config.oo_server_availability_delay = 6 # seconds
15
+
16
+ # Wait time after server start
17
+ config.oo_server_start_delay = 2 #seconds
18
+
19
+
20
+ config.oo_conversion_max_tries = 2
21
+
22
+ #max time for performing a conversion (then restart an attempt)
23
+ config.oo_conversion_max_time = 5 #seconds
24
+
25
+ #Path where conversions are done (by default, the system temp dir)
26
+ #config.tmp_path = File.expand_path("../tmp", __FILE__)
27
+
28
+ #Logger (otherwise rails logger)
29
+ #config.logger = ActiveSupport::BufferedLogger.new("your/log/path")
30
+ end
@@ -0,0 +1,14 @@
1
+ require "active_support/core_ext"
2
+
3
+ require "proselytism/version"
4
+ require "proselytism/shared"
5
+ require "proselytism/proselytism"
6
+ require "proselytism/converter"
7
+
8
+ require "proselytism/converters/open_office"
9
+ require "proselytism/converters/pdf_to_text"
10
+ require "proselytism/converters/pdf_images"
11
+ require "proselytism/converters/ppm_to_jpeg"
12
+
13
+
14
+ require "proselytism/engine" if defined? Rails
@@ -0,0 +1,74 @@
1
+ require 'singleton'
2
+ require 'active_support/core_ext/class/attribute'
3
+ require 'active_support/core_ext/module/aliasing'
4
+
5
+ module Proselytism
6
+ module Converters
7
+ class Base
8
+ include ::Singleton
9
+ include Proselytism::Shared
10
+ class_attribute :from, :to, :subclasses
11
+
12
+ class Error < Exception; end
13
+
14
+ def config
15
+ Proselytism.config
16
+ end
17
+
18
+ def destination_file_path(origin, options={})
19
+ if options[:dest]
20
+ options[:dest]
21
+ else
22
+ File.join options[:dir], File.basename(origin).gsub(/\..*$/, options[:folder] ? '' : ".#{options[:to]}")
23
+ end
24
+ end
25
+
26
+ #call perform logging duration and potential errors
27
+ def convert(file_path, options={})
28
+ log :debug, "convert #{file_path} to :#{options[:to]}" do
29
+ begin
30
+ perform(file_path, options)
31
+ rescue Error => e
32
+ log :error, e.message
33
+ raise e
34
+ end
35
+ end
36
+ end
37
+
38
+ #execute a command and raise error with the command output if it fails
39
+ def execute(command)
40
+ output = `#{command}`
41
+ if $?.exitstatus != 0
42
+ raise self.class::Error, ["#{self.class.name} unable to exec command: #{command}",'--', output,'--'].join("\n")
43
+ end
44
+ $?.exitstatus == 0
45
+ end
46
+
47
+ singleton_class.class_eval do
48
+
49
+ def inherited_with_registering(subclass)
50
+ self.subclasses ||= []
51
+ self.subclasses << subclass
52
+ inherited_without_registering(subclass)
53
+ subclass
54
+ end
55
+
56
+ alias_method_chain :inherited, :registering
57
+
58
+ [:from, :to].each do |attr|
59
+ define_method "#{attr}_with_default" do |*formats|
60
+ if formats.length
61
+ self.send "#{attr}=", formats.map(&:to_sym)
62
+ else
63
+ self.send "#{attr}_without_default" || []
64
+ end
65
+ end
66
+ alias_method_chain attr, :default
67
+ end
68
+ end
69
+
70
+ end
71
+
72
+ end
73
+ end
74
+
@@ -0,0 +1,183 @@
1
+ require 'system_timer'
2
+
3
+ class Proselytism::Converters::OpenOffice < Proselytism::Converters::Base
4
+
5
+ class Error < parent::Base::Error; end
6
+
7
+ from :odt, :doc, :rtf, :sxw, :docx, :txt, :html, :htm, :wps
8
+ to :odt, :doc, :rtf, :pdf, :txt, :html, :htm, :wps
9
+
10
+ module Bridges
11
+ module JOD
12
+ def self.command
13
+ "java -jar #{File.expand_path('open_office/odconverters/jodconverter-2.2.2/lib/jodconverter-cli-2.2.2.jar', File.dirname(__FILE__))}"
14
+ end
15
+ end
16
+ module PYOD
17
+ def self.command
18
+ "python #{File.expand_path('open_office/odconverters/pyodconverter.py', File.dirname(__FILE__))}"
19
+ end
20
+ end
21
+ end
22
+
23
+ # Converts documents
24
+ def perform(origin, options={})
25
+ destination = destination_file_path(origin, options)
26
+ command = "#{Proselytism::Converters::OpenOffice}::Bridges::#{config.oo_server_bridge}".constantize.command + " '#{origin}' '#{destination}' 2>&1"
27
+ server.perform { execute(command) }
28
+ destination
29
+ end
30
+
31
+
32
+ # HACK to work around a strange OpenOffice behavior: files are normally saved
33
+ # in UTF-8, but sometimes, for some obscure reason, text files are in ISO-8859-1,
34
+ # so we add a check to re-convert them to the encoding we expect
35
+ def convert_txt_to_utf8(file_path)
36
+ if `file #{file_path}` =~ /ISO/
37
+ system("iconv --from-code ISO-8859-1 --to-code UTF-8 #{file_path} > tmp_iconv.txt && mv tmp_iconv.txt #{file_path}")
38
+ end
39
+ end
40
+
41
+ def server
42
+ Server.instance
43
+ end
44
+
45
+
46
+ class Server
47
+ include Singleton
48
+ include Proselytism::Shared
49
+ class Error < Proselytism::Converters::OpenOffice::Error; end
50
+
51
+ def config
52
+ Proselytism.config
53
+ end
54
+
55
+ # Run a block with a timeout and retry if the first execution fails
56
+ def perform(&block)
57
+ attempts = 1
58
+ begin
59
+ ensure_available
60
+ Timeout::timeout(config.oo_conversion_max_time,&block)
61
+ rescue Timeout::Error, Proselytism::Converters::OpenOffice::Error
62
+ attempts += 1
63
+ restart!
64
+ retry unless attempts > config.oo_conversion_max_tries
65
+ raise Error, "OpenOffice server perform timeout"
66
+ end
67
+ end
68
+
69
+ # Restart if running or start new instance
70
+ def restart!
71
+ stop! if running?
72
+ start!
73
+ end
74
+
75
+ # Start new instance
76
+ def start!
77
+ log :debug, "OpenOffice server started" do
78
+ system "#{config.open_office_path} -headless -accept=\"socket,host=127.0.0.1,port=8100\;urp\;\" -nofirststartwizard -nologo -nocrashreport -norestore -nolockcheck -nodefault &"
79
+ begin
80
+ SystemTimer.timeout_after(3) do
81
+ while !running?
82
+ log :debug, ". Waiting OpenOffice server to run"
83
+ sleep(0.1)
84
+ end
85
+ end
86
+ rescue
87
+ raise Error, "Could not start OpenOffice"
88
+ end
89
+ # OpenOffice needs some time to wake up
90
+ sleep(config.oo_server_start_delay)
91
+ end
92
+ nil
93
+ end
94
+
95
+ def start_with_running_control!
96
+ if running?
97
+ log :debug, "OpenOffice server is allready running"
98
+ else
99
+ start_without_running_control!
100
+ end
101
+ end
102
+ alias_method_chain :start!, :running_control
103
+
104
+ # Kill running instance
105
+ def stop!
106
+ #operating_system = `uname -s`
107
+ #command = "killall -u `whoami` -#{operating_system == "Linux" ? 'q' : 'm'} soffice"
108
+ begin
109
+ Timeout::timeout(3) do
110
+ loop do
111
+ system("killall -9 soffice && killall -9 soffice.bin > /dev/null 2>&1")
112
+ break unless running?
113
+ sleep(0.2)
114
+ end
115
+ end
116
+ rescue Timeout::Error
117
+ raise Error, "Could not kill OpenOffice !!"
118
+ ensure
119
+ # Remove user profile
120
+ system("rm -rf ~/openoffice.org*")
121
+ log :debug, "OpenOffice server stopped"
122
+ end
123
+ end
124
+
125
+ def stop_with_running_control!
126
+ if !running?
127
+ log :debug, "OpenOffice server is allready stoped"
128
+ else
129
+ stop_without_running_control!
130
+ end
131
+ end
132
+ alias_method_chain :stop!, :running_control
133
+
134
+ # Is OpenOffice server running?
135
+ def running?
136
+ !`pgrep soffice`.blank?
137
+ end
138
+
139
+
140
+ # Is the current instance stuck ?
141
+ def stalled?
142
+ begin
143
+ SystemTimer.timeout_after config.oo_server_max_cpu_delay do
144
+ loop do
145
+ cpu_usage = `ps -Ao pcpu,pid,comm= | grep soffice`.split(/\n/).map{|usage| /^\s*\d+/.match(usage)[0].strip.to_i}
146
+ break unless cpu_usage.all?{|usage| usage > config.oo_server_max_cpu }
147
+ sleep(0.2)
148
+ end
149
+ end
150
+ false
151
+ rescue
152
+ log :error, "OpenOffice server stalled : \n---\n" + `ps -Ao pcpu,pid,comm | grep soffice` + "\n---"
153
+ true
154
+ end
155
+ end
156
+
157
+ def available?
158
+ `ps -o pid,stat,command |grep soffice`.match(/\d+\s(\w)/i)[1] == "S"
159
+ end
160
+
161
+ # Make sure there will be an available instance
162
+ def ensure_available
163
+ start! unless running?
164
+ restart! if stalled?
165
+ begin
166
+ SystemTimer.timeout_after config.oo_server_availability_delay do
167
+ while !available?
168
+ log :debug, ". Waiting OpenOffice server availability"
169
+ sleep(0.5)
170
+ end
171
+ end
172
+ rescue Timeout::Error
173
+ raise Error, "OpenOffice Server unavailable"
174
+ end
175
+ true
176
+ end
177
+
178
+ end
179
+
180
+
181
+
182
+ end
183
+