documentalist 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. data/Manifest +48 -12
  2. data/README.rdoc +1 -1
  3. data/Rakefile +28 -12
  4. data/config/default.yml +17 -0
  5. data/documentalist.gemspec +40 -34
  6. data/lib/backends/net_pbm.rb +10 -0
  7. data/lib/backends/odf_merge.rb +64 -0
  8. data/lib/backends/open_office.rb +108 -0
  9. data/lib/backends/open_office/bridges/jodconverter-2.2.2/ChangeLog.txt +119 -0
  10. data/lib/backends/open_office/bridges/jodconverter-2.2.2/LICENSE.txt +504 -0
  11. data/lib/backends/open_office/bridges/jodconverter-2.2.2/README.txt +58 -0
  12. data/lib/backends/open_office/bridges/jodconverter-2.2.2/docs/jodconverter-2.2.2-javadoc.jar +0 -0
  13. data/lib/backends/open_office/bridges/jodconverter-2.2.2/docs/third-party-licenses/license-commons-io.txt +203 -0
  14. data/lib/backends/open_office/bridges/jodconverter-2.2.2/docs/third-party-licenses/license-openoffice.org.txt +8 -0
  15. data/lib/backends/open_office/bridges/jodconverter-2.2.2/docs/third-party-licenses/license-slf4j.txt +24 -0
  16. data/lib/backends/open_office/bridges/jodconverter-2.2.2/docs/third-party-licenses/license-xstream.txt +27 -0
  17. data/lib/backends/open_office/bridges/jodconverter-2.2.2/document-formats.xml +513 -0
  18. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/DEPENDENCIES.txt +17 -0
  19. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/commons-cli-1.2.jar +0 -0
  20. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/commons-io-1.4.jar +0 -0
  21. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/jodconverter-2.2.2.jar +0 -0
  22. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/jodconverter-cli-2.2.2.jar +0 -0
  23. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/juh-3.0.1.jar +0 -0
  24. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/jurt-3.0.1.jar +0 -0
  25. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/ridl-3.0.1.jar +0 -0
  26. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/slf4j-api-1.5.6.jar +0 -0
  27. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/slf4j-jdk14-1.5.6.jar +0 -0
  28. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/unoil-3.0.1.jar +0 -0
  29. data/lib/backends/open_office/bridges/jodconverter-2.2.2/lib/xstream-1.3.1.jar +0 -0
  30. data/lib/backends/open_office/bridges/jodconverter-2.2.2/src/jodconverter-2.2.2-sources.jar +0 -0
  31. data/lib/backends/open_office/bridges/jodconverter-2.2.2/src/jodconverter-cli-2.2.2-sources.jar +0 -0
  32. data/lib/{DocumentConverter.py → backends/open_office/bridges/pyodconverter.py} +0 -0
  33. data/lib/{open_office → backends/open_office}/server.rb +8 -5
  34. data/lib/backends/pdf_tools.rb +14 -0
  35. data/lib/documentalist.rb +130 -42
  36. data/lib/tasks/tasks.rb +6 -0
  37. data/rails/config/documentalist.yml.tpl +67 -0
  38. data/rails/init.rb +12 -1
  39. data/rails/initialize_configuration.rb +6 -0
  40. data/test/documentalist_test.rb +48 -25
  41. data/test/fixtures/{fixture.odt → fixture_001.odt} +0 -0
  42. data/test/net_pbm_test.rb +7 -0
  43. data/test/odf_merge_test.rb +56 -0
  44. data/test/open_office_test.rb +70 -13
  45. data/test/pdf_tools_test.rb +8 -0
  46. data/test/rails_integration_test.rb +39 -0
  47. data/test/test_helper.rb +29 -0
  48. metadata +112 -12
@@ -0,0 +1,17 @@
1
+ To use the library in your own Java app you need
2
+
3
+ * commons-io
4
+ * jodconverter
5
+ * juh
6
+ * jurt
7
+ * ridl
8
+ * slf4j-api
9
+ * slf4j-jdk14 or another slf4j implementation - see http://slf4j.org
10
+ * unoil
11
+ * xstream - only if you use XmlDocumentFormatRegistry
12
+
13
+ The command line interface additionally requires
14
+
15
+ * commons-cli
16
+ * jodconverter-cli
17
+
@@ -1,9 +1,10 @@
1
- require 'timeout'
1
+ require 'system_timer'
2
2
  require 'tmpdir'
3
3
 
4
4
  module OpenOffice
5
5
  module Server
6
6
  # Path to the Python executable
7
+ # TODO : Eww, use config instead
7
8
  PYTHON_PATH = "/usr/bin/python"
8
9
 
9
10
  # Server can convert from the following file formats
@@ -13,6 +14,7 @@ module OpenOffice
13
14
  CONVERT_TO = [:odt, :doc, :rtf, :pdf, :txt, :html, :htm, :wps]
14
15
 
15
16
  # Python conversion script path
17
+ # TODO : Wrong path, the script was moved to bridges/pyodconverter.py
16
18
  PY_OD_CONVERTER = File.join(File.dirname(__FILE__), "../DocumentConverter.py")
17
19
 
18
20
  # Maximum allowed CPU usage for an OpenOffice process
@@ -22,6 +24,7 @@ module OpenOffice
22
24
  SERVER_START_DELAY = 4
23
25
 
24
26
  # Log file
27
+ # TODO : Wrong
25
28
  LOG_FILE = Object.const_defined?(:RAILS_ROOT) ? File.join(RAILS_ROOT, "log", "openoffice.log") : ""
26
29
 
27
30
  def self.convert(origin, options = {:to => :txt})
@@ -37,7 +40,7 @@ module OpenOffice
37
40
  raise "Can't convert #{origin} to #{options[:to]}"
38
41
  end
39
42
 
40
- timeout(10, :attempts => 2) do
43
+ Documentalist.timeout(10, :attempts => 2) do
41
44
  system("#{PYTHON_PATH} #{PY_OD_CONVERTER} #{origin} #{destination} > /dev/null 2>&1")
42
45
 
43
46
  # HACK : sometimes text files get saved in ISO-8859-1 instead of regular UTF-8, so we force
@@ -70,7 +73,7 @@ module OpenOffice
70
73
  system("/usr/bin/soffice -headless -accept=\"socket,host=127.0.0.1,port=8100;urp;\" -nofirststartwizard -nologo -nocrashreport -norestore -nolockcheck -nodefault #{">>" unless LOG_FILE.empty?} #{LOG_FILE} 2>&1 &")
71
74
 
72
75
  begin
73
- timeout(2) do
76
+ SystemTimer.timeout(2.seconds) do
74
77
  while !running?
75
78
  print "."
76
79
  end
@@ -90,7 +93,7 @@ module OpenOffice
90
93
  raise "Not running!" unless running?
91
94
 
92
95
  begin
93
- timeout(3, :attempts => 2) do
96
+ Documentalist.timeout(3, :attempts => 2) do
94
97
  while(running?)
95
98
  system("pkill -9 office")
96
99
  end
@@ -129,7 +132,7 @@ module OpenOffice
129
132
  if block_given?
130
133
  attempts = options[:attempts] || 1
131
134
  begin
132
- Timeout::timeout(max_time) do
135
+ SystemTimer.timeout(max_time) do
133
136
  yield
134
137
  end
135
138
  rescue Timeout::Error
@@ -0,0 +1,14 @@
1
+ # To change this template, choose Tools | Templates
2
+ # and open the template in the editor.
3
+
4
+ module Documentalist
5
+ module PdfTools
6
+ def self.convert(origin, options)
7
+ if system("pdftotext #{origin} #{options[:destination]} > /dev/null 2>&1")
8
+ options[:destination]
9
+ else
10
+ raise "PdfTools failed"
11
+ end
12
+ end
13
+ end
14
+ end
@@ -1,63 +1,151 @@
1
1
  require 'rubygems'
2
- require 'erb'
3
- require 'fileutils'
4
- require 'tmpdir'
5
- require 'zip/zip'
6
- require 'open_office/server'
2
+ require 'yaml'
3
+ require 'system_timer'
4
+ require 'logger'
5
+
6
+ # Require all backends
7
+ Dir.glob(File.join(File.dirname(__FILE__), 'backends', '*.rb')).each do |backend|
8
+ require backend
9
+ end
7
10
 
8
11
  module Documentalist
9
- def self.merge(str, options = {})
10
- locals = options[:locals]
12
+ @@config = {}
13
+ @@logger = nil
11
14
 
12
- if locals and locals.is_a? Hash
13
- locals.each do |k,v|
14
- instance_variable_set("@#{k.to_s}".to_sym, v)
15
- end
16
- end
15
+ def self.config
16
+ default_config! unless config?
17
+ @@config
18
+ end
17
19
 
18
- ERB.new(str).result(binding)
20
+ def self.config=(hash)
21
+ # We want to symbolize keys ourselves since we're not depending on Active Support
22
+ @@config = symbolize hash
19
23
  end
20
24
 
21
- def self.get_contents(odt_file)
22
- contents = ""
23
- Zip::ZipFile.open(odt_file) { |zip| contents = zip.read("content.xml") }
24
- contents.gsub("&lt;%", "<%").gsub("%&gt;", "%>")
25
+ def self.config?
26
+ @@config != {}
25
27
  end
26
28
 
27
- def self.merge_template(template, options = {})
28
- # Get template contents
29
- tmp_contents= Tempfile.new("officer-contents")
30
- tmp_contents.write(merge(get_contents(template), :locals => options[:locals]))
31
- tmp_contents.close
29
+ def self.default_config!
30
+ config_from_yaml! File.join(File.dirname(__FILE__), %w{.. config default.yml})
31
+ end
32
+
33
+ def self.config_from_yaml!(file, options = {})
34
+ self.config = YAML::load(File.open(file))
35
+ self.config = config[options[:section].to_sym] if options[:section]
36
+ end
37
+
38
+ BACKENDS = {
39
+ OpenOffice => {[:odt, :doc, :rtf, :docx, :txt, :html, :htm, :wps] => [:odt, :doc, :rtf, :pdf, :txt, :html, :htm, :wps]},
40
+ NetPBM => {:ppm => [:jpg, :jpeg]},
41
+ PdfTools => {:pdf => :txt},
42
+
43
+ # Find a better pattern to pick backend, this one smells pretty bad
44
+ # WkHTML2PDF => {[:html, :htm] => :pdf}
45
+ }
46
+
47
+ # Finds the relevant server to perform the conversion
48
+ def self.backend_for_conversion(origin, destination)
49
+ origin = origin.to_s.gsub(/.*\./, "").to_sym
50
+ destination = destination.to_s.gsub(/.*\./, "").to_sym
51
+
52
+ BACKENDS.detect do |s, conversions|
53
+ conversions.keys.flatten.include?(origin) and conversions.values.flatten.include?(destination)
54
+ end.to_a.first
55
+ end
56
+
57
+ # Takes all conversion requests and dispatches them appropriately
58
+ def self.convert(file, options={})
59
+ raise "#{file} does not exist !" unless File.exist?(file)
60
+
61
+ unless options[:to] or options[:to_format]
62
+ raise Documentalist::Error.new("No destination or format was given")
63
+ end
64
+
65
+ # Convert to plain text by default
66
+ options[:to_format] = options[:to_format] ? options[:to_format].to_sym : :txt
67
+
68
+ unless options[:to]
69
+ options[:to] = file.gsub(/#{"\\" + File.extname(file)}$/, ".#{options[:to_format].to_s}")
70
+ end
71
+
72
+ options[:from_format] = File.extname(file).gsub(/\./, "").to_sym
32
73
 
33
- # Copy the template so we can merge the data into the copy
34
- tmp_merged_template = File.join(Dir.tmpdir, "merged-template-#{rand(10**9)}#{File.extname(template)}")
35
- FileUtils.cp(template, tmp_merged_template)
74
+ backend = backend_for_conversion(options[:from_format], options[:to_format])
75
+ converted = backend.convert(file, options)
36
76
 
37
- # Stuff the merged contents.xml into the OpenDocument zip
38
- Zip::ZipFile.open(tmp_merged_template) do |zip|
39
- zip.replace("content.xml", tmp_contents.path)
40
- zip.commit
77
+ yield(converted) if block_given?
78
+ converted
79
+ end
80
+
81
+ def self.extract_text(file)
82
+ converted = convert(file, :to => :txt)
83
+ if converted and File.exist?(converted)
84
+ text = File.open(converted).read.toutf8
85
+ FileUtils.rm(converted)
86
+
87
+ yield(extracted_text) if block_given?
88
+ text
41
89
  end
90
+ end
91
+
92
+ def self.extract_images(file)
93
+ temp_dir = File.join(CONVERSIONS_PATH, (Time.new.to_f*100_000).to_i.to_s)
94
+
95
+ if File.extname(file) == '.pdf'
96
+ temp_file = File.join(temp_dir, File.basename(file))
42
97
 
43
- # Remove the merged contents.xml
44
- tmp_contents.unlink
98
+ system "mkdir #{temp_dir} && cp #{file} #{temp_file}"
99
+ system "cd #{temp_dir} && pdfimages #{temp_file} 'img'"
45
100
 
46
- # Manages the converted file depending on the context
47
- if options[:to]
48
- if File.extname(options[:to]) == File.extname(template)
49
- FileUtils.mv(tmp_merged_template, options[:to])
50
- else
51
- OpenOffice::Server.convert(tmp_merged_template, options[:to])
52
- FileUtils.rm(tmp_merged_template)
101
+ Dir.glob(File.join(temp_dir, "*.ppm")).each do |ppm_image|
102
+ Documentalist.convert(ppm_image, :to => :jpeg)
53
103
  end
54
104
  else
55
- FileUtils.rm(template)
56
- FileUtils.mv(tmp_merged_template, template)
105
+ convert file, :to => :html, :directory => temp_dir
57
106
  end
107
+
108
+ image_file_names = Dir.glob(File.join(temp_dir, "*.{jpg,jpeg,bmp,tif,tiff,gif,png}"))
109
+
110
+ yield(image_file_names) if block_given?
111
+ image_file_names
58
112
  end
59
113
 
60
- def self.convert(from, to)
61
- OpenOffice::Server.convert(from, :to => to)
114
+ # Runs a block with a system-enforced timeout and optionally retries it, with an
115
+ # optional sleep between attempts of running the given block.
116
+ # All times are in seconds.
117
+ def self.timeout(time_limit = 0, options = {:attempts => 1, :sleep => nil})
118
+ if block_given?
119
+ attempts = options[:attempts] || 1
120
+ begin
121
+ SystemTimer.timeout time_limit do
122
+ yield
123
+ end
124
+ rescue Timeout::Error
125
+ attempts -= 1
126
+ sleep(options[:sleep]) if options[:sleep]
127
+ retry unless attempts.zero?
128
+ raise
129
+ end
130
+ end
62
131
  end
132
+
133
+ def self.logger
134
+ unless @@logger
135
+ @@logger = Logger.new(Documentalist.config[:log_file])
136
+ @@logger.level = Logger.const_get(config[:log_level] ? config[:log_level].upcase : "WARN")
137
+ end
138
+
139
+ @@logger
140
+ end
141
+
142
+ # Recursively symbolizes the keys of the given hash in place and returns that same hash
143
+ def self.symbolize(hash)
144
+ hash.each_key do |key|
145
+ hash[key.to_sym] = hash.delete key
146
+ hash[key.to_sym] = symbolize(hash[key.to_sym]) if hash[key.to_sym].is_a?(Hash)
147
+ end
148
+ end
149
+
150
+ class Error < RuntimeError; end
63
151
  end
@@ -0,0 +1,6 @@
1
+ namespace :documentalist do
2
+ desc "This task checks that the required binaries and libraries are available on the target system"
3
+ task :check_install do
4
+ puts "Checking install..."
5
+ end
6
+ end
@@ -0,0 +1,67 @@
1
+ # Sample Rails configuration file
2
+ # Optional settings are :
3
+ # * log_path : Allows you to override the default log file [Rails.root/log/documentalist.log]
4
+
5
+ development:
6
+ # Python configuration
7
+ python:
8
+ path: /usr/bin/python
9
+
10
+ # Java configuration
11
+ java:
12
+ path: /usr/bin/java
13
+
14
+ # OpenOffice configuration
15
+ open_office:
16
+ # Path to the OpenOffice binary
17
+ path: /usr/bin/soffice
18
+
19
+ # Select desired bridge between PYOD and JOD
20
+ bridge: JOD
21
+
22
+ # Maximum allowed CPU usage before the process is considered stalled
23
+ max_cpu: 80
24
+
25
+ # OpenOffice server allowed startup time (seconds)
26
+ max_startup_time: 4
27
+
28
+ # OpenOffice server allowed waking up time (between startup and actual processing)
29
+ wakeup_time: 3
30
+
31
+ # Conversion tries before giving up
32
+ max_conversion_attempts: 3
33
+
34
+ # Maximum allowed time for converting a document
35
+ max_conversion_time: 6
36
+
37
+ test:
38
+ python:
39
+ path: /usr/bin/python
40
+
41
+ java:
42
+ path: /usr/bin/java
43
+
44
+ open_office:
45
+ path: /usr/bin/soffice
46
+ bridge: JOD
47
+ max_cpu: 80
48
+ max_startup_time: 4
49
+ wakeup_time: 3
50
+ max_conversion_attempts: 3
51
+ max_conversion_time: 6
52
+
53
+ production:
54
+ python:
55
+ path: /usr/bin/python
56
+
57
+ java:
58
+ path: /usr/bin/java
59
+
60
+ open_office:
61
+ path: /usr/bin/soffice
62
+ bridge: JOD
63
+ max_cpu: 80
64
+ max_startup_time: 4
65
+ wakeup_time: 3
66
+ max_conversion_attempts: 3
67
+ max_conversion_time: 6
@@ -1 +1,12 @@
1
- require 'documentalist'
1
+ require 'yaml'
2
+
3
+ require File.join(File.dirname(__FILE__), %w{initialize_configuration})
4
+ require File.join(File.dirname(__FILE__), %w{.. lib documentalist})
5
+
6
+ # Load configuration from Rails.root/config/documentalist.yml
7
+ Documentalist.config_from_yaml! File.join(RAILS_ROOT, %w{config documentalist.yml}), :section => RAILS_ENV
8
+
9
+ # Set a default for the logfile if it hasn't been provided by the configuration file
10
+ unless Documentalist.config[:logfile]
11
+ Documentalist.config[:logfile] = File.join(RAILS_ROOT, %w{log documentalist-#{RAILS_ENV}.log})
12
+ end
@@ -0,0 +1,6 @@
1
+ # Initializes a standard configuration file in Rails.root/config/documentalist.yml
2
+ unless File.exists?(File.join(RAILS_ROOT , %w{config documentalist.yml}))
3
+ FileUtils.cp(File.join(File.dirname(__FILE__), %w{config documentalist.yml.tpl}), File.join(RAILS_ROOT, %w{config documentalist.yml}))
4
+ end
5
+
6
+
@@ -1,40 +1,63 @@
1
- require 'test/unit'
2
- require 'documentalist'
1
+ require 'test_helper'
2
+ require 'system_timer'
3
+ require 'tmpdir'
3
4
 
4
5
  class DocumentalistTest < Test::Unit::TestCase
5
- @@odt_fixture = File.join(File.dirname(__FILE__), "fixtures/fixture.odt")
6
+ include FlexMock::TestCase
6
7
 
7
- def test_merge
8
- template = "<%= @var1 %><%= 1.upto(3).map{ |n| n.to_s }.join %><%= @var2 %>"
8
+ # Test the custom symbolize method used as a replacement for the Active Support version
9
+ def test_symbolize
10
+ hash = { "a" => "b",
11
+ "c" => {
12
+ "d" => "e"
13
+ }
14
+ }
9
15
 
10
- merged = Documentalist.merge(template, :locals => {
11
- :var1 => "test",
12
- :var2 => "working?"
16
+ symbolized = { :a => "b",
17
+ :c => {
18
+ :d => "e"
13
19
  }
14
- )
20
+ }
21
+
22
+ assert_equal Documentalist.send(:symbolize, hash),
23
+ symbolized,
24
+ "Hash wasn't properly symbolized"
25
+ end
15
26
 
16
- assert_equal "test123working?", merged, "Merge wasn't performed correctly"
27
+ # Test that we use a system timeout and not a green thread based timeout that
28
+ # could possibly not work on some long external system calls
29
+ def test_timeout_uses_system_timeout
30
+ flexmock(SystemTimer).should_receive(:timeout).once
31
+ Documentalist.timeout(0.1) { }
17
32
  end
18
33
 
19
- def test_read_zipped_odt
20
- contents = Documentalist.get_contents(@@odt_fixture)
34
+ # Test that we have a default configuration for Documentalist even if no Rails environment is loaded
35
+ def test_default_config
36
+ # Check that we did not get some Rails context from other tests
37
+ assert !Object.const_defined?(:RAILS_ENV)
21
38
 
22
- assert_match /Hello/, contents
23
- assert_match /thing/, contents
24
- assert !(contents =~ /%&gt;/)
25
- assert !(contents =~ /&lt;%=/)
39
+ # Check that at least a configuration key has been magically set
40
+ assert Documentalist.config[:open_office]
26
41
  end
27
42
 
28
- def test_odt_merge
29
- template = "#{File.join(File.dirname(__FILE__), "fixtures/fixture.odt")}"
30
- result = "#{File.join(File.dirname(__FILE__), "fixtures/result.odt")}"
43
+ def test_logger
44
+ log_file = File.join(Dir.tmpdir, "#{rand(10 ** 9).to_s}.log")
45
+
46
+ Documentalist.config[:log_file] = log_file
47
+ assert !File.exists?(log_file), "Log file already exists"
48
+
49
+ Documentalist.logger
50
+ assert File.exists?(log_file), "Log file should have been created"
51
+
52
+ assert_no_difference("File.size(\"#{log_file}\")", "Nothing should have been written") do
53
+ Documentalist.logger.debug("This message should go nowhere")
54
+ end
31
55
 
32
- Documentalist.merge_template(template,
33
- :locals => {:thing => "world"},
34
- :to => result
35
- )
56
+ assert_difference("File.size(\"#{log_file}\")", nil, "Nothing should have been written") do
57
+ Documentalist.logger.warn("This message should be written !")
58
+ end
36
59
 
37
- assert /world/, Documentalist.get_contents(result)
38
- File.delete(result)
60
+ FileUtils.rm(log_file)
61
+ assert !File.exists?(log_file), "Log file hasn't been removed properly"
39
62
  end
40
63
  end