proselytism 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. data/.gitignore +18 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +94 -0
  5. data/Rakefile +9 -0
  6. data/lib/generators/proselytism/config_generator.rb +14 -0
  7. data/lib/generators/proselytism/initializer_generator.rb +14 -0
  8. data/lib/generators/proselytism/templates/config.yml +29 -0
  9. data/lib/generators/proselytism/templates/initializer.rb +30 -0
  10. data/lib/proselytism.rb +14 -0
  11. data/lib/proselytism/converter.rb +74 -0
  12. data/lib/proselytism/converters/open_office.rb +183 -0
  13. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/ChangeLog.txt +119 -0
  14. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/LICENSE.txt +504 -0
  15. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/README.txt +58 -0
  16. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/docs/jodconverter-2.2.2-javadoc.jar +0 -0
  17. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/docs/third-party-licenses/license-commons-io.txt +203 -0
  18. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/docs/third-party-licenses/license-openoffice.org.txt +8 -0
  19. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/docs/third-party-licenses/license-slf4j.txt +24 -0
  20. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/docs/third-party-licenses/license-xstream.txt +27 -0
  21. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/document-formats.xml +513 -0
  22. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/DEPENDENCIES.txt +17 -0
  23. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/commons-cli-1.2.jar +0 -0
  24. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/commons-io-1.4.jar +0 -0
  25. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/jodconverter-2.2.2.jar +0 -0
  26. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/jodconverter-cli-2.2.2.jar +0 -0
  27. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/juh-3.0.1.jar +0 -0
  28. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/jurt-3.0.1.jar +0 -0
  29. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/ridl-3.0.1.jar +0 -0
  30. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/slf4j-api-1.5.6.jar +0 -0
  31. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/slf4j-jdk14-1.5.6.jar +0 -0
  32. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/unoil-3.0.1.jar +0 -0
  33. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/lib/xstream-1.3.1.jar +0 -0
  34. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/src/jodconverter-2.2.2-sources.jar +0 -0
  35. data/lib/proselytism/converters/open_office/odconverters/jodconverter-2.2.2/src/jodconverter-cli-2.2.2-sources.jar +0 -0
  36. data/lib/proselytism/converters/open_office/odconverters/pyodconverter.py +151 -0
  37. data/lib/proselytism/converters/pdf_images.rb +13 -0
  38. data/lib/proselytism/converters/pdf_to_text.rb +14 -0
  39. data/lib/proselytism/converters/ppm_to_jpeg.rb +14 -0
  40. data/lib/proselytism/engine.rb +25 -0
  41. data/lib/proselytism/proselytism.rb +90 -0
  42. data/lib/proselytism/shared.rb +22 -0
  43. data/lib/proselytism/version.rb +3 -0
  44. data/proselytism.gemspec +25 -0
  45. data/spec/.DS_Store +0 -0
  46. data/spec/base_converter_spec.rb +15 -0
  47. data/spec/fixtures/001.doc +0 -0
  48. data/spec/fixtures/001.pdf +0 -0
  49. data/spec/fixtures/001.txt +63 -0
  50. data/spec/fixtures/002.doc +0 -0
  51. data/spec/fixtures/003.docx +0 -0
  52. data/spec/fixtures/004-fake.pdf +0 -0
  53. data/spec/fixtures/005-fake.doc +0 -0
  54. data/spec/open_office_spec.rb +83 -0
  55. data/spec/pdf_images_spec.rb +19 -0
  56. data/spec/pdf_to_text_spec.rb +19 -0
  57. data/spec/proselytism_spec.rb +106 -0
  58. data/spec/shared_spec.rb +26 -0
  59. data/spec/spec_helper.rb +48 -0
  60. metadata +195 -0
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ spec/tmp
16
+ test/version_tmp
17
+ .idea
18
+ spec/tmp/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in proselytism.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Itkin
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,94 @@
1
+ # Proselytism
2
+
3
+ Document converter, text and image extractor using OpenOffice headless server, pdf_tools and net_pbm
4
+
5
+ ## Note
6
+
7
+ This gem was originally written as a RoR 3.2 engine running on Ruby 1.8.7.
8
+ It should be framework agnostic and has been tested on Ubuntu and MacOSX.
9
+
10
+ Due to its dependency on system_timer, it doesn't work with Ruby 1.9.x
11
+
12
+ ## Installation
13
+
14
+ Install the required external libraries:
15
+
16
+ # aptitude install netpbm
17
+ # aptitude install xpdf
18
+ # aptitude install libreoffice
19
+
20
+ Add this line to your application's Gemfile:
21
+
22
+ gem 'proselytism', :git => "git://github.com/itkin/proselytism.git"
23
+
24
+ And then execute:
25
+
26
+ $ bundle
27
+
28
+ Generate the config file or / and an initializer
29
+
30
+ $ rails g proselytism:config
31
+ $ rails g proselytism:initializer
32
+
33
+ As an engine, Proselytism automatically loads and configures itself from /config/proselytism.yml if it exists.
34
+ You can override these configuration params with an initializer. This is especially useful when you want a custom log file.
35
+
36
+ ```ruby
37
+ #/config/initializers/proselytism.rb
38
+ Proselytism.config do |config|
39
+ config.logger = ActiveSupport::BufferedLogger.new(File.join(Rails.root, 'log', 'proselytism.log'))
40
+ end
41
+ ```
42
+
43
+ ## Usage
44
+
45
+ ```ruby
46
+ Proselytism.convert source_file_path, :to => :pdf do |converted_file_path|
47
+
48
+ end
49
+ Proselytism.extract_text source_file_path do |extracted_text|
50
+
51
+ end
52
+ Proselytism.extract_images source_file_path do |image_files_paths|
53
+
54
+ end
55
+ ```
56
+
57
+ Proselytism creates its converted files in temporary folders.
58
+ - If you pass a block to the method, the folders are automatically deleted once the block has been yielded to, so use or copy the file content within the block
59
+ - If you don't pass a block, don't forget to safely remove the temp folder
60
+
61
+ ```ruby
62
+ pdf_file_path = Proselytism.convert source_file_path, :to => :pdf
63
+ FileUtils.remove_entry_secure File.dirname(pdf_file_path)
64
+ ```
65
+
66
+ ## Add your own converter
67
+
68
+ Add your own converter by extending Proselytism::Converters::Base
69
+ - Your converter will be automatically selected and used based on its `from` and `to` extension lists
70
+ - Add a perform method which
71
+ - define a text command
72
+ - call execute
73
+ - return the converted file(s) path
74
+
75
+ ```ruby
76
+ class MyConverter < Proselytism::Converters::Base
77
+ from :ext1, :ext2
78
+ to :ext3, :ext4
79
+
80
+ def perform(origin, options={})
81
+ destination = destination_file_path(origin, options)
82
+ command = "pdftotext #{origin} #{destination} 2>&1"
83
+ execute command
84
+ destination
85
+ end
86
+ end
87
+ ```
88
+ ## Contributing
89
+
90
+ 1. Fork it
91
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
92
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
93
+ 4. Push to the branch (`git push origin my-new-feature`)
94
+ 5. Create new Pull Request
@@ -0,0 +1,9 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ require "rspec/core/rake_task"
4
+
5
+ RSpec::Core::RakeTask.new
6
+
7
+ task :default => :spec
8
+ task :test => :spec
9
+
@@ -0,0 +1,14 @@
1
+ require 'rails/generators'
2
+
3
+ module Proselytism
4
+ class ConfigGenerator < Rails::Generators::Base
5
+ # Rails generator (invoked as `rails g proselytism:config` per the README) that installs the gem's YAML config file.
6
+ self.source_paths << File.join(File.dirname(__FILE__), 'templates') # look for templates in the sibling templates/ directory
7
+ # Generator step: uses the generator's `template` action to produce config/proselytism.yml from the bundled config.yml.
8
+ desc "Generate Proselytism config file load before rails initialize"
9
+ def create_config_file
10
+ template 'config.yml', 'config/proselytism.yml'
11
+ end
12
+
13
+ end
14
+ end
@@ -0,0 +1,14 @@
1
+ require 'rails/generators'
2
+
3
+ module Proselytism
4
+ class InitializerGenerator < Rails::Generators::Base
5
+ # Rails generator (invoked as `rails g proselytism:initializer` per the README) that installs an initializer for the gem.
6
+ self.source_paths << File.join(File.dirname(__FILE__), 'templates') # look for templates in the sibling templates/ directory
7
+
8
+ # Generator step: uses the generator's `template` action to produce config/initializers/proselytism.rb from the bundled initializer.rb.
9
+ desc "Generate an initializer file to override the configuration file"
10
+ def create_initializer_file
11
+ template 'initializer.rb', 'config/initializers/proselytism.rb'
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,29 @@
1
+ test:
2
+ open_office_path: "/Applications/OpenOffice.org.app/Contents/MacOS/soffice"
3
+ oo_server_bridge: "JOD"
4
+ oo_server_max_cpu: 95 #percent
5
+ oo_server_max_cpu_delay: 2 #seconds
6
+ oo_server_availability_delay: 6 # seconds
7
+ oo_server_start_delay: 2 #seconds
8
+ oo_conversion_max_tries: 2
9
+ oo_conversion_max_time: 4 #seconds
10
+
11
+ development:
12
+ open_office_path: "/Applications/OpenOffice.org.app/Contents/MacOS/soffice"
13
+ oo_server_bridge: "JOD"
14
+ oo_server_max_cpu: 95 #percent
15
+ oo_server_max_cpu_delay: 2 #seconds
16
+ oo_server_availability_delay: 6 # seconds
17
+ oo_server_start_delay: 2 #seconds
18
+ oo_conversion_max_tries: 2
19
+ oo_conversion_max_time: 4 #seconds
20
+
21
+ production:
22
+ open_office_path: "/usr/bin/soffice"
23
+ oo_server_bridge: "PYOD"
24
+ oo_server_max_cpu: 98 #percent
25
+ oo_server_max_cpu_delay: 2 #seconds
26
+ oo_server_availability_delay: 7 #seconds
27
+ oo_server_start_delay: 2 #seconds
28
+ oo_conversion_max_tries: 2
29
+ oo_conversion_max_time: 5 #seconds
@@ -0,0 +1,30 @@
1
+ Proselytism.config do |config|
2
+ #Open Office binary path /usr/bin/soffice for unix or /Applications/OpenOffice.org.app/Contents/MacOS/soffice for mac OSX
3
+ config.open_office_path = "/Applications/OpenOffice.org.app/Contents/MacOS/soffice"
4
+
5
+ #Bridge PYOD or JOD (PYOD doesn't work as is on mac)
6
+ config.oo_server_bridge = "JOD"
7
+
8
+ #When ensuring server availability (before converting a doc),
9
+ #Restart the server if all processes are above max cpu during max_cpu_delay
10
+ config.oo_server_max_cpu = 95 #percent
11
+ config.oo_server_max_cpu_delay = 2 #seconds
12
+
13
+ #Time the server waits for availability before converting a doc
14
+ config.oo_server_availability_delay = 6 # seconds
15
+
16
+ # Wait time after server start
17
+ config.oo_server_start_delay = 2 #seconds
18
+
19
+
20
+ config.oo_conversion_max_tries = 2
21
+
22
+ #max time for performing a conversion (then restart an attempt)
23
+ config.oo_conversion_max_time = 5 #seconds
24
+
25
+ #Path where conversions are done (by default the system temp dir)
26
+ #config.tmp_path = File.expand_path("../tmp", __FILE__)
27
+
28
+ #Logger (otherwise rails logger)
29
+ #config.logger = ActiveSupport::BufferedLogger.new("your/log/path")
30
+ end
@@ -0,0 +1,14 @@
1
+ require "active_support/core_ext"
2
+
3
+ require "proselytism/version"
4
+ require "proselytism/shared"
5
+ require "proselytism/proselytism"
6
+ require "proselytism/converter"
7
+
8
+ require "proselytism/converters/open_office"
9
+ require "proselytism/converters/pdf_to_text"
10
+ require "proselytism/converters/pdf_images"
11
+ require "proselytism/converters/ppm_to_jpeg"
12
+
13
+
14
+ require "proselytism/engine" if defined? Rails
@@ -0,0 +1,74 @@
1
+ require 'singleton'
2
+ require 'active_support/core_ext/class/attribute'
3
+ require 'active_support/core_ext/module/aliasing'
4
+
5
+ module Proselytism
6
+ module Converters
7
+ class Base
8
+ include ::Singleton
9
+ include Proselytism::Shared
10
+ class_attribute :from, :to, :subclasses
11
+
12
+ class Error < StandardError; end # base error for all converters; derives from StandardError (not Exception) so ordinary `rescue` clauses can handle it
13
+
14
+ def config # convenience accessor: returns whatever Proselytism.config returns
15
+ Proselytism.config
16
+ end
17
+
18
+ def destination_file_path(origin, options={}) # builds the output path for converting `origin`
19
+ if options[:dest] # an explicit :dest is returned untouched
20
+ options[:dest]
21
+ else
22
+ File.join options[:dir], File.basename(origin).gsub(/\..*$/, options[:folder] ? '' : ".#{options[:to]}") # inside options[:dir]: origin's basename with everything from its first dot replaced by ".<to>", or simply removed when options[:folder] is set
23
+ end
24
+ end
25
+
26
+ # Calls perform(file_path, options) inside a :debug `log` block and returns through it; an Error raised by perform is logged at :error level and re-raised. (The original note says this logs the duration; `log` comes from Proselytism::Shared, not shown here.)
27
+ def convert(file_path, options={})
28
+ log :debug, "convert #{file_path} to :#{options[:to]}" do
29
+ begin
30
+ perform(file_path, options) # not defined in Base; supplied by the concrete converter
31
+ rescue Error => e # only this Error class and its subclasses; anything else propagates without the :error log
32
+ log :error, e.message
33
+ raise e
34
+ end
35
+ end
36
+ end
37
+
38
+ # Runs `command` through the shell (backticks) and captures its standard output. Raises self.class::Error, with the command and the captured output in the message, when the exit status is not 0; otherwise returns true.
39
+ def execute(command)
40
+ output = `#{command}`
41
+ if $?.exitstatus != 0
42
+ raise self.class::Error, ["#{self.class.name} unable to exec command: #{command}",'--', output,'--'].join("\n")
43
+ end
44
+ $?.exitstatus == 0 # always true at this point, since any other status raised above
45
+ end
46
+
47
+ singleton_class.class_eval do
48
+
49
+ def inherited_with_registering(subclass) # `inherited` hook that records each new subclass in the `subclasses` class attribute
50
+ self.subclasses ||= [] # create the registry on first use
51
+ self.subclasses << subclass
52
+ inherited_without_registering(subclass) # then run the original `inherited` hook
53
+ subclass
54
+ end
55
+
56
+ alias_method_chain :inherited, :registering # `inherited` now dispatches to inherited_with_registering
57
+
58
+ [:from, :to].each do |attr|
59
+ define_method "#{attr}_with_default" do |*formats|
60
+ if formats.length
61
+ self.send "#{attr}=", formats.map(&:to_sym)
62
+ else
63
+ self.send "#{attr}_without_default" || []
64
+ end
65
+ end
66
+ alias_method_chain attr, :default
67
+ end
68
+ end
69
+
70
+ end
71
+
72
+ end
73
+ end
74
+
@@ -0,0 +1,183 @@
1
+ require 'system_timer'
2
+
3
+ class Proselytism::Converters::OpenOffice < Proselytism::Converters::Base
4
+
5
+ class Error < parent::Base::Error; end # OpenOffice-specific error; inherits from parent::Base::Error (presumably Proselytism::Converters::Base::Error)
6
+
7
+ from :odt, :doc, :rtf, :sxw, :docx, :txt, :html, :htm, :wps # extensions this converter accepts as input
8
+ to :odt, :doc, :rtf, :pdf, :txt, :html, :htm, :wps # extensions this converter can produce
9
+
10
+ module Bridges # command-line front ends for the conversion; perform picks one by the name held in config.oo_server_bridge
11
+ module JOD
12
+ def self.command # shell command prefix running the bundled JODConverter CLI jar
13
+ "java -jar #{File.expand_path('open_office/odconverters/jodconverter-2.2.2/lib/jodconverter-cli-2.2.2.jar', File.dirname(__FILE__))}"
14
+ end
15
+ end
16
+ module PYOD
17
+ def self.command # shell command prefix running the bundled pyodconverter.py script
18
+ "python #{File.expand_path('open_office/odconverters/pyodconverter.py', File.dirname(__FILE__))}"
19
+ end
20
+ end
21
+ end
22
+
23
+ # Converts `origin` to the path computed by destination_file_path, using the bridge named by config.oo_server_bridge, and returns that destination path.
24
+ def perform(origin, options={})
25
+ destination = destination_file_path(origin, options)
26
+ command = "#{Proselytism::Converters::OpenOffice}::Bridges::#{config.oo_server_bridge}".constantize.command + " '#{origin}' '#{destination}' 2>&1" # NOTE(review): paths are only wrapped in single quotes, so a path containing ' would break the command
27
+ server.perform { execute(command) } # run through Server#perform (availability check, timeout, retry)
28
+ destination
29
+ end
30
+
31
+
32
+ # HACK to work around an odd OpenOffice behaviour: files are normally saved
33
+ # as UTF-8, but sometimes, for some obscure reason, text files come out as ISO-8859-1,
34
+ # so we add a check and re-convert the file to the encoding we expect.
35
+ def convert_txt_to_utf8(file_path)
36
+ if `file #{file_path}` =~ /ISO/ # the `file` utility's output mentions ISO
37
+ system("iconv --from-code ISO-8859-1 --to-code UTF-8 #{file_path} > tmp_iconv.txt && mv tmp_iconv.txt #{file_path}") # NOTE(review): tmp_iconv.txt is a fixed name in the current working directory and file_path is interpolated unquoted
38
+ end
39
+ end
40
+
41
+ def server # the singleton Server instance that manages the soffice process
42
+ Server.instance
43
+ end
44
+
45
+
46
+ class Server
47
+ include Singleton
48
+ include Proselytism::Shared
49
+ class Error < Proselytism::Converters::OpenOffice::Error; end
50
+
51
+ def config # convenience accessor: returns whatever Proselytism.config returns
52
+ Proselytism.config
53
+ end
54
+
55
+ # Run a block with a timeout and retry if an execution fails. Each attempt first calls ensure_available, then gives the block config.oo_conversion_max_time seconds. On Timeout::Error or an OpenOffice::Error the server is restarted and the attempt repeated, up to config.oo_conversion_max_tries attempts in total; after that Server::Error is raised.
56
+ def perform(&block)
57
+ attempts = 1
58
+ begin
59
+ ensure_available # inside the begin, so its own Error also triggers the restart/retry path
60
+ Timeout::timeout(config.oo_conversion_max_time,&block)
61
+ rescue Timeout::Error, Proselytism::Converters::OpenOffice::Error
62
+ attempts += 1
63
+ restart! # also runs after the last failed attempt, right before giving up
64
+ retry unless attempts > config.oo_conversion_max_tries
65
+ raise Error, "OpenOffice server perform timeout" # NOTE(review): same message whether the cause was a timeout or a conversion error
66
+ end
67
+ end
68
+
69
+ # Restart if running or start new instance
70
+ def restart!
71
+ stop! if running? # running? is true when `pgrep soffice` finds a process
72
+ start! # start! / stop! resolve to the *_with_running_control! wrappers set up by alias_method_chain in this class
73
+ end
74
+
75
+ # Start new instance: launches soffice headless in the background, accepting connections on 127.0.0.1 port 8100, waits for the process to appear, then sleeps config.oo_server_start_delay seconds. Returns nil. Because of `alias_method_chain :start!, :running_control`, this body ends up reachable as start_without_running_control!.
76
+ def start!
77
+ log :debug, "OpenOffice server started" do
78
+ system "#{config.open_office_path} -headless -accept=\"socket,host=127.0.0.1,port=8100\;urp\;\" -nofirststartwizard -nologo -nocrashreport -norestore -nolockcheck -nodefault &"
79
+ begin
80
+ SystemTimer.timeout_after(3) do # allow at most 3 seconds for the process to show up
81
+ while !running?
82
+ log :debug, ". Waiting OpenOffice server to run"
83
+ sleep(0.1)
84
+ end
85
+ end
86
+ rescue # NOTE(review): bare rescue; presumably meant to catch the SystemTimer timeout — confirm it is a StandardError on the target Ruby
87
+ raise Error, "Could not start OpenOffice"
88
+ end
89
+ # OpenOffice needs some time to wake up
90
+ sleep(config.oo_server_start_delay)
91
+ end
92
+ nil
93
+ end
94
+
95
+ def start_with_running_control! # guard around start!: only logs when a soffice process already exists, otherwise starts one
96
+ if running?
97
+ log :debug, "OpenOffice server is allready running"
98
+ else
99
+ start_without_running_control!
100
+ end
101
+ end
102
+ alias_method_chain :start!, :running_control # start! now dispatches to start_with_running_control!
103
+
104
+ # Kill running instance: sends SIGKILL to soffice and soffice.bin every 0.2s until running? is false, for at most 3 seconds. Raises Error when processes are still alive after that; in every case removes ~/openoffice.org* and logs the stop.
105
+ def stop!
106
+ #operating_system = `uname -s`
107
+ #command = "killall -u `whoami` -#{operating_system == "Linux" ? 'q' : 'm'} soffice"
108
+ begin
109
+ Timeout::timeout(3) do
110
+ loop do
111
+ system("(killall -9 soffice; killall -9 soffice.bin) > /dev/null 2>&1") # `;` instead of `&&`: soffice.bin must be killed even when no plain soffice process exists; output of both commands is discarded
112
+ break unless running?
113
+ sleep(0.2)
114
+ end
115
+ end
116
+ rescue Timeout::Error
117
+ raise Error, "Could not kill OpenOffice !!"
118
+ ensure
119
+ # Remove user profile
120
+ system("rm -rf ~/openoffice.org*")
121
+ log :debug, "OpenOffice server stopped"
122
+ end
123
+ end
124
+
125
+ def stop_with_running_control! # guard around stop!: only logs when no soffice process exists, otherwise kills it
126
+ if !running?
127
+ log :debug, "OpenOffice server is allready stoped"
128
+ else
129
+ stop_without_running_control!
130
+ end
131
+ end
132
+ alias_method_chain :stop!, :running_control # stop! now dispatches to stop_with_running_control!
133
+
134
+ # Is OpenOffice server running? True when `pgrep soffice` prints anything.
135
+ def running?
136
+ !`pgrep soffice`.blank? # pgrep prints nothing when no process matches
137
+ end
138
+
139
+
140
+ # Is the current instance stuck ? Samples the CPU usage of soffice processes every 0.2s, within a SystemTimer limit of config.oo_server_max_cpu_delay seconds. Returns false as soon as one sample contains a process at or below config.oo_server_max_cpu; returns true (after logging the process list) when the rescue below is reached.
141
+ def stalled?
142
+ begin
143
+ SystemTimer.timeout_after config.oo_server_max_cpu_delay do
144
+ loop do
145
+ cpu_usage = `ps -Ao pcpu,pid,comm= | grep soffice`.split(/\n/).map{|usage| /^\s*\d+/.match(usage)[0].strip.to_i} # integer part of the %CPU column, one entry per matching line
146
+ break unless cpu_usage.all?{|usage| usage > config.oo_server_max_cpu } # note: all? is also true for an empty list
147
+ sleep(0.2)
148
+ end
149
+ end
150
+ false
151
+ rescue # NOTE(review): bare rescue; presumably meant for the SystemTimer timeout — confirm it is a StandardError on the target Ruby
152
+ log :error, "OpenOffice server stalled : \n---\n" + `ps -Ao pcpu,pid,comm | grep soffice` + "\n---"
153
+ true
154
+ end
155
+ end
156
+
157
+ def available?
158
+ `ps -o pid,stat,command |grep soffice`.match(/\d+\s(\w)/i)[1] == "S"
159
+ end
160
+
161
+ # Make sure there will be an available instance: starts the server when none is running, restarts it when stalled? says so, then polls available? every 0.5s within a SystemTimer limit of config.oo_server_availability_delay seconds. Returns true, or raises Error "OpenOffice Server unavailable" on Timeout::Error.
162
+ def ensure_available
163
+ start! unless running?
164
+ restart! if stalled?
165
+ begin
166
+ SystemTimer.timeout_after config.oo_server_availability_delay do
167
+ while !available?
168
+ log :debug, ". Waiting OpenOffice server availability"
169
+ sleep(0.5)
170
+ end
171
+ end
172
+ rescue Timeout::Error # presumably what SystemTimer.timeout_after raises on expiry — confirm against the system_timer gem
173
+ raise Error, "OpenOffice Server unavailable"
174
+ end
175
+ true
176
+ end
177
+
178
+ end
179
+
180
+
181
+
182
+ end
183
+