documentalist 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. data/Manifest +48 -12
  2. data/README.rdoc +1 -1
  3. data/Rakefile +28 -12
  4. data/config/default.yml +17 -0
  5. data/documentalist.gemspec +40 -34
  6. data/lib/backends/net_pbm.rb +10 -0
  7. data/lib/backends/odf_merge.rb +64 -0
  8. data/lib/backends/open_office.rb +108 -0
  9. data/lib/backends/open_office/bridges/jodconverter-2.2.2/ChangeLog.txt +119 -0
  10. data/lib/backends/open_office/bridges/jodconverter-2.2.2/LICENSE.txt +504 -0
  11. data/lib/backends/open_office/bridges/jodconverter-2.2.2/README.txt +58 -0
  12. data/lib/backends/open_office/bridges/jodconverter-2.2.2/docs/jodconverter-2.2.2-javadoc.jar +0 -0
  13. data/lib/backends/open_office/bridges/jodconverter-2.2.2/docs/third-party-licenses/license-commons-io.txt +203 -0
  14. data/lib/backends/open_office/bridges/jodconverter-2.2.2/docs/third-party-licenses/license-openoffice.org.txt +8 -0
  15. data/lib/backends/open_office/bridges/jodconverter-2.2.2/docs/third-party-licenses/license-slf4j.txt +24 -0
  16. data/lib/backends/open_office/bridges/jodconverter-2.2.2/docs/third-party-licenses/license-xstream.txt +27 -0
  17. data/lib/backends/open_office/bridges/jodconverter-2.2.2/document-formats.xml +513 -0
  18. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/DEPENDENCIES.txt +17 -0
  19. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/commons-cli-1.2.jar +0 -0
  20. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/commons-io-1.4.jar +0 -0
  21. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/jodconverter-2.2.2.jar +0 -0
  22. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/jodconverter-cli-2.2.2.jar +0 -0
  23. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/juh-3.0.1.jar +0 -0
  24. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/jurt-3.0.1.jar +0 -0
  25. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/ridl-3.0.1.jar +0 -0
  26. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/slf4j-api-1.5.6.jar +0 -0
  27. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/slf4j-jdk14-1.5.6.jar +0 -0
  28. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/unoil-3.0.1.jar +0 -0
  29. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/xstream-1.3.1.jar +0 -0
  30. data/lib/backends/open_office/bridges/jodconverter-2.2.2/src/jodconverter-2.2.2-sources.jar +0 -0
  31. data/lib/backends/open_office/bridges/jodconverter-2.2.2/src/jodconverter-cli-2.2.2-sources.jar +0 -0
  32. data/lib/{DocumentConverter.py → backends/open_office/bridges/pyodconverter.py} +0 -0
  33. data/lib/{open_office → backends/open_office}/server.rb +8 -5
  34. data/lib/backends/pdf_tools.rb +14 -0
  35. data/lib/documentalist.rb +130 -42
  36. data/lib/tasks/tasks.rb +6 -0
  37. data/rails/config/documentalist.yml.tpl +67 -0
  38. data/rails/init.rb +12 -1
  39. data/rails/initialize_configuration.rb +6 -0
  40. data/test/documentalist_test.rb +48 -25
  41. data/test/fixtures/{fixture.odt → fixture_001.odt} +0 -0
  42. data/test/net_pbm_test.rb +7 -0
  43. data/test/odf_merge_test.rb +56 -0
  44. data/test/open_office_test.rb +70 -13
  45. data/test/pdf_tools_test.rb +8 -0
  46. data/test/rails_integration_test.rb +39 -0
  47. data/test/test_helper.rb +29 -0
  48. metadata +112 -12
@@ -0,0 +1,17 @@
1
+ To use the library in your own Java app you need
2
+
3
+ * commons-io
4
+ * jodconverter
5
+ * juh
6
+ * jurt
7
+ * ridl
8
+ * slf4j-api
9
+ * slf4j-jdk14 or another slf4j implementation - see http://slf4j.org
10
+ * unoil
11
+ * xstream - only if you use XmlDocumentFormatRegistry
12
+
13
+ The command line interface additionally requires
14
+
15
+ * commons-cli
16
+ * jodconverter-cli
17
+
@@ -1,9 +1,10 @@
1
- require 'timeout'
1
+ require 'system_timer'
2
2
  require 'tmpdir'
3
3
 
4
4
  module OpenOffice
5
5
  module Server
6
6
  # Path to the Python executable
7
+ # TODO : Eww, use config instead
7
8
  PYTHON_PATH = "/usr/bin/python"
8
9
 
9
10
  # Server can convert from the following file formats
@@ -13,6 +14,7 @@ module OpenOffice
13
14
  CONVERT_TO = [:odt, :doc, :rtf, :pdf, :txt, :html, :htm, :wps]
14
15
 
15
16
  # Python conversion script path
17
+ # TODO : Wrong
16
18
  PY_OD_CONVERTER = File.join(File.dirname(__FILE__), "../DocumentConverter.py")
17
19
 
18
20
  # Maximum allowed CPU usage for an OpenOffice process
@@ -22,6 +24,7 @@ module OpenOffice
22
24
  SERVER_START_DELAY = 4
23
25
 
24
26
  # Log file
27
+ # TODO : Wrong
25
28
  LOG_FILE = Object.const_defined?(:RAILS_ROOT) ? File.join(RAILS_ROOT, "log", "openoffice.log") : ""
26
29
 
27
30
  def self.convert(origin, options = {:to => :txt})
@@ -37,7 +40,7 @@ module OpenOffice
37
40
  raise "Can't convert #{origin} to #{options[:to]}"
38
41
  end
39
42
 
40
- timeout(10, :attempts => 2) do
43
+ Documentalist.timeout(10, :attempts => 2) do
41
44
  system("#{PYTHON_PATH} #{PY_OD_CONVERTER} #{origin} #{destination} > /dev/null 2>&1")
42
45
 
43
46
  # HACK : sometimes text files get saved in ISO-8859-1 instead of regular UTF-8, so we force
@@ -70,7 +73,7 @@ module OpenOffice
70
73
  system("/usr/bin/soffice -headless -accept=\"socket,host=127.0.0.1,port=8100;urp;\" -nofirststartwizard -nologo -nocrashreport -norestore -nolockcheck -nodefault #{">>" unless LOG_FILE.empty?} #{LOG_FILE} 2>&1 &")
71
74
 
72
75
  begin
73
- timeout(2) do
76
+ SystemTimer.timeout(2.seconds) do
74
77
  while !running?
75
78
  print "."
76
79
  end
@@ -90,7 +93,7 @@ module OpenOffice
90
93
  raise "Not running!" unless running?
91
94
 
92
95
  begin
93
- timeout(3, :attempts => 2) do
96
+ Documentalist.timeout(3, :attempts => 2) do
94
97
  while(running?)
95
98
  system("pkill -9 office")
96
99
  end
@@ -129,7 +132,7 @@ module OpenOffice
129
132
  if block_given?
130
133
  attempts = options[:attempts] || 1
131
134
  begin
132
- Timeout::timeout(max_time) do
135
+ SystemTimer.timeout(max_time) do
133
136
  yield
134
137
  end
135
138
  rescue Timeout::Error
@@ -0,0 +1,14 @@
1
+ # To change this template, choose Tools | Templates
2
+ # and open the template in the editor.
3
+
4
+ module Documentalist
5
+ module PdfTools
6
+ def self.convert(origin, options)
7
+ if system("pdftotext #{origin} #{options[:destination]} > /dev/null 2>&1")
8
+ options[:destination]
9
+ else
10
+ raise "PdfTools failed"
11
+ end
12
+ end
13
+ end
14
+ end
@@ -1,63 +1,151 @@
1
1
  require 'rubygems'
2
- require 'erb'
3
- require 'fileutils'
4
- require 'tmpdir'
5
- require 'zip/zip'
6
- require 'open_office/server'
2
+ require 'yaml'
3
+ require 'system_timer'
4
+ require 'logger'
5
+
6
+ # Require all backends
7
+ Dir.glob(File.join(File.dirname(__FILE__), 'backends', '*.rb')).each do |backend|
8
+ require backend
9
+ end
7
10
 
8
11
  module Documentalist
9
- def self.merge(str, options = {})
10
- locals = options[:locals]
12
+ @@config = {}
13
+ @@logger = nil
11
14
 
12
- if locals and locals.is_a? Hash
13
- locals.each do |k,v|
14
- instance_variable_set("@#{k.to_s}".to_sym, v)
15
- end
16
- end
15
+ def self.config
16
+ default_config! unless config?
17
+ @@config
18
+ end
17
19
 
18
- ERB.new(str).result(binding)
20
+ def self.config=(hash)
21
+ # We want to symbolize keys ourselves since we're not depending on Active Support
22
+ @@config = symbolize hash
19
23
  end
20
24
 
21
- def self.get_contents(odt_file)
22
- contents = ""
23
- Zip::ZipFile.open(odt_file) { |zip| contents = zip.read("content.xml") }
24
- contents.gsub("&lt;%", "<%").gsub("%&gt;", "%>")
25
+ def self.config?
26
+ @@config != {}
25
27
  end
26
28
 
27
- def self.merge_template(template, options = {})
28
- # Get template contents
29
- tmp_contents= Tempfile.new("officer-contents")
30
- tmp_contents.write(merge(get_contents(template), :locals => options[:locals]))
31
- tmp_contents.close
29
+ def self.default_config!
30
+ config_from_yaml! File.join(File.dirname(__FILE__), %w{.. config default.yml})
31
+ end
32
+
33
+ def self.config_from_yaml!(file, options = {})
34
+ self.config = YAML::load(File.open(file))
35
+ self.config = config[options[:section].to_sym] if options[:section]
36
+ end
37
+
38
+ BACKENDS = {
39
+ OpenOffice => {[:odt, :doc, :rtf, :docx, :txt, :html, :htm, :wps] => [:odt, :doc, :rtf, :pdf, :txt, :html, :htm, :wps]},
40
+ NetPBM => {:ppm => [:jpg, :jpeg]},
41
+ PdfTools => {:pdf => :txt},
42
+
43
+ # Find a better pattern to pick backend, this one smells pretty bad
44
+ # WkHTML2PDF => {[:html, :htm] => :pdf}
45
+ }
46
+
47
+ # Finds the relevant server to perform the conversion
48
+ def self.backend_for_conversion(origin, destination)
49
+ origin = origin.to_s.gsub(/.*\./, "").to_sym
50
+ destination = destination.to_s.gsub(/.*\./, "").to_sym
51
+
52
+ BACKENDS.detect do |s, conversions|
53
+ conversions.keys.flatten.include?(origin) and conversions.values.flatten.include?(destination)
54
+ end.to_a.first
55
+ end
56
+
57
+ # Takes all conversion requests and dispatches them appropriately
58
+ def self.convert(file, options={})
59
+ raise "#{file} does not exist !" unless File.exist?(file)
60
+
61
+ unless options[:to] or options[:to_format]
62
+ raise Documentalist::Error.new("No destination or format was given")
63
+ end
64
+
65
+ # Convert to plain text by default
66
+ options[:to_format] = options[:to_format] ? options[:to_format].to_sym : :txt
67
+
68
+ unless options[:to]
69
+ options[:to] = file.gsub(/#{"\\" + File.extname(file)}$/, ".#{options[:to_format].to_s}")
70
+ end
71
+
72
+ options[:from_format] = File.extname(file).gsub(/\./, "").to_sym
32
73
 
33
- # Copy the template so we can merge the data into the copy
34
- tmp_merged_template = File.join(Dir.tmpdir, "merged-template-#{rand(10**9)}#{File.extname(template)}")
35
- FileUtils.cp(template, tmp_merged_template)
74
+ backend = backend_for_conversion(options[:from_format], options[:to_format])
75
+ converted = backend.convert(file, options)
36
76
 
37
- # Stuff the merged contents.xml into the OpenDocument zip
38
- Zip::ZipFile.open(tmp_merged_template) do |zip|
39
- zip.replace("content.xml", tmp_contents.path)
40
- zip.commit
77
+ yield(converted) if block_given?
78
+ converted
79
+ end
80
+
81
+ def self.extract_text(file)
82
+ converted = convert(file, :to => :txt)
83
+ if converted and File.exist?(converted)
84
+ text = File.open(converted).read.toutf8
85
+ FileUtils.rm(converted)
86
+
87
+ yield(extracted_text) if block_given?
88
+ text
41
89
  end
90
+ end
91
+
92
+ def self.extract_images(file)
93
+ temp_dir = File.join(CONVERSIONS_PATH, (Time.new.to_f*100_000).to_i.to_s)
94
+
95
+ if File.extname(file) == '.pdf'
96
+ temp_file = File.join(temp_dir, File.basename(file))
42
97
 
43
- # Remove the merged contents.xml
44
- tmp_contents.unlink
98
+ system "mkdir #{temp_dir} && cp #{file} #{temp_file}"
99
+ system "cd #{temp_dir} && pdfimages #{temp_file} 'img'"
45
100
 
46
- # Manages the converted file depending on the context
47
- if options[:to]
48
- if File.extname(options[:to]) == File.extname(template)
49
- FileUtils.mv(tmp_merged_template, options[:to])
50
- else
51
- OpenOffice::Server.convert(tmp_merged_template, options[:to])
52
- FileUtils.rm(tmp_merged_template)
101
+ Dir.glob(File.join(temp_dir, "*.ppm")).each do |ppm_image|
102
+ Documentalist.convert(ppm_image, :to => :jpeg)
53
103
  end
54
104
  else
55
- FileUtils.rm(template)
56
- FileUtils.mv(tmp_merged_template, template)
105
+ convert file, :to => :html, :directory => temp_dir
57
106
  end
107
+
108
+ image_file_names = Dir.glob(File.join(temp_dir, "*.{jpg,jpeg,bmp,tif,tiff,gif,png}"))
109
+
110
+ yield(image_file_names) if block_given?
111
+ image_file_names
58
112
  end
59
113
 
60
- def self.convert(from, to)
61
- OpenOffice::Server.convert(from, :to => to)
114
+ # Runs a block with a system-enforced timeout, optionally retrying it with an
115
+ # optional sleep between attempts at running the given block.
116
+ # All times are in seconds.
117
+ def self.timeout(time_limit = 0, options = {:attempts => 1, :sleep => nil})
118
+ if block_given?
119
+ attempts = options[:attempts] || 1
120
+ begin
121
+ SystemTimer.timeout time_limit do
122
+ yield
123
+ end
124
+ rescue Timeout::Error
125
+ attempts -= 1
126
+ sleep(options[:sleep]) if options[:sleep]
127
+ retry unless attempts.zero?
128
+ raise
129
+ end
130
+ end
62
131
  end
132
+
133
+ def self.logger
134
+ unless @@logger
135
+ @@logger = Logger.new(Documentalist.config[:log_file])
136
+ @@logger.level = Logger.const_get(config[:log_level] ? config[:log_level].upcase : "WARN")
137
+ end
138
+
139
+ @@logger
140
+ end
141
+
142
+ # Returns a new hash with recursively symbolized keys
143
+ def self.symbolize(hash)
144
+ hash.each_key do |key|
145
+ hash[key.to_sym] = hash.delete key
146
+ hash[key.to_sym] = symbolize(hash[key.to_sym]) if hash[key.to_sym].is_a?(Hash)
147
+ end
148
+ end
149
+
150
+ class Error < RuntimeError; end
63
151
  end
@@ -0,0 +1,6 @@
1
+ namespace :documentalist do
2
+ desc "This task checks that the required binaries and libraries are available on the target system"
3
+ task :check_install do
4
+ puts "Checking install..."
5
+ end
6
+ end
@@ -0,0 +1,67 @@
1
+ # Sample Rails configuration file
2
+ # Optional settings are :
3
+ # * log_path : Allows you to override the default log file [Rails.root/log/documentalist.log]
4
+
5
+ development:
6
+ # Python configuration
7
+ python:
8
+ path: /usr/bin/python
9
+
10
+ # Java configuration
11
+ java:
12
+ path: /usr/bin/java
13
+
14
+ # OpenOffice configuration
15
+ open_office:
16
+ # Path to the OpenOffice binary
17
+ path: /usr/bin/soffice
18
+
19
+ # Select desired bridge between PYOD and JOD
20
+ bridge: JOD
21
+
22
+ # Maximum allowed CPU usage before the process is considered stalled
23
+ max_cpu: 80
24
+
25
+ # OpenOffice server allowed startup time (seconds)
26
+ max_startup_time: 4
27
+
28
+ # OpenOffice server allowed waking up time (between startup and actual processing)
29
+ wakeup_time: 3
30
+
31
+ # Conversion tries before giving up
32
+ max_conversion_attempts: 3
33
+
34
+ # Maximum allowed time for converting a document
35
+ max_conversion_time: 6
36
+
37
+ test:
38
+ python:
39
+ path: /usr/bin/python
40
+
41
+ java:
42
+ path: /usr/bin/java
43
+
44
+ open_office:
45
+ path: /usr/bin/soffice
46
+ bridge: JOD
47
+ max_cpu: 80
48
+ max_startup_time: 4
49
+ wakeup_time: 3
50
+ max_conversion_attempts: 3
51
+ max_conversion_time: 6
52
+
53
+ production:
54
+ python:
55
+ path: /usr/bin/python
56
+
57
+ java:
58
+ path: /usr/bin/java
59
+
60
+ open_office:
61
+ path: /usr/bin/soffice
62
+ bridge: JOD
63
+ max_cpu: 80
64
+ max_startup_time: 4
65
+ wakeup_time: 3
66
+ max_conversion_attempts: 3
67
+ max_conversion_time: 6
@@ -1 +1,12 @@
1
- require 'documentalist'
1
+ require 'yaml'
2
+
3
+ require File.join(File.dirname(__FILE__), %w{initialize_configuration})
4
+ require File.join(File.dirname(__FILE__), %w{.. lib documentalist})
5
+
6
+ # Load configuration from Rails.root/config/documentalist.yml
7
+ Documentalist.config_from_yaml! File.join(RAILS_ROOT, %w{config documentalist.yml}), :section => RAILS_ENV
8
+
9
+ # Set a default for the logfile if it hasn't been provided by the configuration file
10
+ unless Documentalist.config[:logfile]
11
+ Documentalist.config[:logfile] = File.join(RAILS_ROOT, %w{log documentalist-#{RAILS_ENV}.log})
12
+ end
@@ -0,0 +1,6 @@
1
+ # Initializes a standard configuration file in Rails.root/config/documentalist.yml
2
+ unless File.exists?(File.join(RAILS_ROOT , %w{config documentalist.yml}))
3
+ FileUtils.cp(File.join(File.dirname(__FILE__), %w{config documentalist.yml.tpl}), File.join(RAILS_ROOT, %w{config documentalist.yml}))
4
+ end
5
+
6
+
@@ -1,40 +1,63 @@
1
- require 'test/unit'
2
- require 'documentalist'
1
+ require 'test_helper'
2
+ require 'system_timer'
3
+ require 'tmpdir'
3
4
 
4
5
  class DocumentalistTest < Test::Unit::TestCase
5
- @@odt_fixture = File.join(File.dirname(__FILE__), "fixtures/fixture.odt")
6
+ include FlexMock::TestCase
6
7
 
7
- def test_merge
8
- template = "<%= @var1 %><%= 1.upto(3).map{ |n| n.to_s }.join %><%= @var2 %>"
8
+ # Test the custom symbolize method used as a replacement for the Active Support version
9
+ def test_symbolize
10
+ hash = { "a" => "b",
11
+ "c" => {
12
+ "d" => "e"
13
+ }
14
+ }
9
15
 
10
- merged = Documentalist.merge(template, :locals => {
11
- :var1 => "test",
12
- :var2 => "working?"
16
+ symbolized = { :a => "b",
17
+ :c => {
18
+ :d => "e"
13
19
  }
14
- )
20
+ }
21
+
22
+ assert_equal Documentalist.send(:symbolize, hash),
23
+ symbolized,
24
+ "Hash wasn't properly symbolized"
25
+ end
15
26
 
16
- assert_equal "test123working?", merged, "Merge wasn't performed correctly"
27
+ # Test that we use a system timeout and not a green thread based timeout that
28
+ # could possibly not work on some long external system calls
29
+ def test_timeout_uses_system_timeout
30
+ flexmock(SystemTimer).should_receive(:timeout).once
31
+ Documentalist.timeout(0.1) { }
17
32
  end
18
33
 
19
- def test_read_zipped_odt
20
- contents = Documentalist.get_contents(@@odt_fixture)
34
+ # Test that we have a default configuration for Documentalist even if none was loaded explicitly
35
+ def test_default_config
36
+ # Check that we did not get some Rails context from other tests
37
+ assert !Object.const_defined?(:RAILS_ENV)
21
38
 
22
- assert_match /Hello/, contents
23
- assert_match /thing/, contents
24
- assert !(contents =~ /%&gt;/)
25
- assert !(contents =~ /&lt;%=/)
39
+ # Check that at least a configuration key has been magically set
40
+ assert Documentalist.config[:open_office]
26
41
  end
27
42
 
28
- def test_odt_merge
29
- template = "#{File.join(File.dirname(__FILE__), "fixtures/fixture.odt")}"
30
- result = "#{File.join(File.dirname(__FILE__), "fixtures/result.odt")}"
43
+ def test_logger
44
+ log_file = File.join(Dir.tmpdir, "#{rand(10 ** 9).to_s}.log")
45
+
46
+ Documentalist.config[:log_file] = log_file
47
+ assert !File.exists?(log_file), "Log file already exists"
48
+
49
+ Documentalist.logger
50
+ assert File.exists?(log_file), "Log file should have been created"
51
+
52
+ assert_no_difference("File.size(\"#{log_file}\")", "Nothing should have been written") do
53
+ Documentalist.logger.debug("This message should go nowhere")
54
+ end
31
55
 
32
- Documentalist.merge_template(template,
33
- :locals => {:thing => "world"},
34
- :to => result
35
- )
56
+ assert_difference("File.size(\"#{log_file}\")", nil, "Nothing should have been written") do
57
+ Documentalist.logger.warn("This message should be written !")
58
+ end
36
59
 
37
- assert /world/, Documentalist.get_contents(result)
38
- File.delete(result)
60
+ FileUtils.rm(log_file)
61
+ assert !File.exists?(log_file), "Log file hasn't been removed properly"
39
62
  end
40
63
  end