documentalist 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ README.rdoc
2
+ Rakefile
3
+ documentalist.gemspec
4
+ init.rb
5
+ lib/DocumentConverter.py
6
+ lib/documentalist.rb
7
+ lib/open_office/server.rb
8
+ rails/init.rb
9
+ test/documentalist_test.rb
10
+ test/fixtures/fixture.odt
11
+ test/open_office_test.rb
12
+ Manifest
@@ -0,0 +1,3 @@
1
+ = Documentalist
2
+
3
+ Rails gem for talking to OpenOffice.
@@ -0,0 +1,16 @@
1
require 'rubygems'
require 'rake'
require 'echoe'

# Gem packaging settings for documentalist 0.1.0, handed to Echoe.
Echoe.new('documentalist', '0.1.0') do |gem|
  gem.description = "Ruby interface that talks to OpenOffice"
  gem.url = "http://github.com/davout/documentalist"
  gem.author = "David FRANCOIS"
  gem.email = "david.francois@webflows.fr"
  gem.ignore_pattern = ["tmp/*", "script/*"]
  gem.test_pattern = "test/**/*.rb"
  gem.development_dependencies = []
  gem.runtime_dependencies = ['zip >=2.0.2']
end

# Load every additional rake file found under tasks/, in sorted order.
extra_tasks = Dir["#{File.dirname(__FILE__)}/tasks/*.rake"].sort
extra_tasks.each do |task_file|
  load task_file
end
@@ -0,0 +1,34 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
# Gem specification for documentalist 0.1.0.
#
# Setters that are not available on every RubyGems release are guarded with
# respond_to? so the specification keeps loading when one of them is missing.
Gem::Specification.new do |s|
  s.name = %q{documentalist}
  s.version = "0.1.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["David FRANCOIS"]
  s.date = %q{2010-06-08}
  s.description = %q{Ruby interface that talks to OpenOffice}
  s.email = %q{david.francois@webflows.fr}
  s.extra_rdoc_files = ["README.rdoc", "lib/DocumentConverter.py", "lib/documentalist.rb", "lib/open_office/server.rb"]
  s.files = ["README.rdoc", "Rakefile", "documentalist.gemspec", "init.rb", "lib/DocumentConverter.py", "lib/documentalist.rb", "lib/open_office/server.rb", "rails/init.rb", "test/documentalist_test.rb", "test/fixtures/fixture.odt", "test/open_office_test.rb", "Manifest"]
  s.homepage = %q{http://github.com/davout/documentalist}
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Documentalist", "--main", "README.rdoc"]
  s.require_paths = ["lib"]
  # rubyforge_project= is deprecated in later RubyGems releases; only set it where it exists.
  s.rubyforge_project = %q{documentalist} if s.respond_to? :rubyforge_project=
  s.rubygems_version = %q{1.3.7}
  s.summary = %q{Ruby interface that talks to OpenOffice}
  s.test_files = ["test/documentalist_test.rb", "test/open_office_test.rb"]

  # Single definition of the zip requirement shared by every branch below.
  zip_requirement = [">= 2.0.2"]

  if s.respond_to? :specification_version
    s.specification_version = 3

    # add_runtime_dependency is only used on RubyGems >= 1.2.0; older releases
    # fall back to the generic add_dependency.
    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0')
      s.add_runtime_dependency(%q<zip>, zip_requirement)
    else
      s.add_dependency(%q<zip>, zip_requirement)
    end
  else
    s.add_dependency(%q<zip>, zip_requirement)
  end
end
data/init.rb ADDED
File without changes
@@ -0,0 +1,151 @@
1
+ #!/usr/bin/python
2
+ #
3
+ # PyODConverter (Python OpenDocument Converter) v1.0.0 - 2008-05-05
4
+ #
5
+ # This script converts a document from one office format to another by
6
+ # connecting to an OpenOffice.org instance via Python-UNO bridge.
7
+ #
8
+ # Copyright (C) 2008 Mirko Nasato <mirko@artofsolving.com>
9
+ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl-2.1.html
10
+ # - or any later version.
11
+ #
12
# Port used to reach the OpenOffice.org UNO socket when DocumentConverter is
# created without an explicit port.
DEFAULT_OPENOFFICE_PORT = 8100
13
+
14
+ import uno
15
+ from os.path import abspath, isfile, splitext
16
+ from com.sun.star.beans import PropertyValue
17
+ from com.sun.star.task import ErrorCodeIOException
18
+ from com.sun.star.connection import NoConnectException
19
+
20
# Document families, as reported by DocumentConverter._detectFamily.
FAMILY_TEXT = "Text"
FAMILY_SPREADSHEET = "Spreadsheet"
FAMILY_PRESENTATION = "Presentation"
FAMILY_DRAWING = "Drawing"

# Export filter names, keyed first by output file extension and then by the
# family of the loaded document. A missing extension means "unknown output
# format"; a missing family under an extension means "unsupported conversion".
#
# see http://wiki.services.openoffice.org/wiki/Framework/Article/Filter
# for more available filters
FILTER_MAP = {
    "pdf": {
        FAMILY_TEXT: "writer_pdf_Export",
        FAMILY_SPREADSHEET: "calc_pdf_Export",
        FAMILY_PRESENTATION: "impress_pdf_Export",
        FAMILY_DRAWING: "draw_pdf_Export",
    },
    "html": {
        FAMILY_TEXT: "HTML (StarWriter)",
        FAMILY_SPREADSHEET: "HTML (StarCalc)",
        FAMILY_PRESENTATION: "impress_html_Export",
    },
    # Text document formats
    "odt": {FAMILY_TEXT: "writer8"},
    "doc": {FAMILY_TEXT: "MS Word 97"},
    "rtf": {FAMILY_TEXT: "Rich Text Format"},
    "txt": {FAMILY_TEXT: "Text"},
    # Spreadsheet formats
    "ods": {FAMILY_SPREADSHEET: "calc8"},
    "xls": {FAMILY_SPREADSHEET: "MS Excel 97"},
    # Presentation formats
    "odp": {FAMILY_PRESENTATION: "impress8"},
    "ppt": {FAMILY_PRESENTATION: "MS PowerPoint 97"},
    "swf": {FAMILY_PRESENTATION: "impress_flash_Export"},
}
49
+
50
+
51
class DocumentConversionException(Exception):
    """Error raised by DocumentConverter when a conversion cannot proceed.

    The human-readable reason is kept in ``message`` and is also what
    ``str()`` of the exception returns.
    """

    def __init__(self, message):
        # Keep the reason on the instance so __str__ can hand it back verbatim.
        self.message = message

    def __str__(self):
        reason = self.message
        return reason
58
+
59
+
60
class DocumentConverter:
    """Converts a document from one office format to another through an
    OpenOffice.org instance reached over the Python-UNO bridge.

    The output format is chosen from the extension of the output file (see
    FILTER_MAP) combined with the family of the loaded document.
    """

    def __init__(self, port=DEFAULT_OPENOFFICE_PORT):
        """Connect to the OpenOffice.org instance listening on localhost:port.

        Raises DocumentConversionException when no instance accepts the
        connection.
        """
        localContext = uno.getComponentContext()
        resolver = localContext.ServiceManager.createInstanceWithContext(
            "com.sun.star.bridge.UnoUrlResolver", localContext)
        try:
            context = resolver.resolve(
                "uno:socket,host=localhost,port=%s;urp;StarOffice.ComponentContext" % port)
        except NoConnectException:
            # Call-style raise: valid on both Python 2 and Python 3.
            raise DocumentConversionException(
                "failed to connect to OpenOffice.org on port %s" % port)
        self.desktop = context.ServiceManager.createInstanceWithContext(
            "com.sun.star.frame.Desktop", context)

    def convert(self, inputFile, outputFile):
        """Load inputFile (hidden) and store it as outputFile.

        Raises DocumentConversionException when the document cannot be
        loaded, when the output extension is unknown, or when the loaded
        document's family cannot be exported to that extension.
        """
        inputUrl = self._toFileUrl(inputFile)
        outputUrl = self._toFileUrl(outputFile)

        document = self.desktop.loadComponentFromURL(
            inputUrl, "_blank", 0, self._toProperties(Hidden=True))
        if document is None:
            # Without this check the failure would surface later as an
            # unrelated AttributeError on None.
            raise DocumentConversionException(
                "failed to load input document: %s" % inputFile)

        # Everything after a successful load runs under the finally, so the
        # document is closed even when the filter lookup raises.
        try:
            try:
                document.refresh()
            except AttributeError:
                # Not every document type exposes refresh(); ignore.
                pass

            outputExt = self._getFileExt(outputFile)
            filterName = self._filterName(document, outputExt)
            document.storeToURL(outputUrl, self._toProperties(FilterName=filterName))
        finally:
            document.close(True)

    def _filterName(self, document, outputExt):
        """Return the export filter name for this document and extension."""
        family = self._detectFamily(document)
        try:
            filterByFamily = FILTER_MAP[outputExt]
        except KeyError:
            raise DocumentConversionException(
                "unknown output format: '%s'" % outputExt)
        try:
            return filterByFamily[family]
        except KeyError:
            raise DocumentConversionException(
                "unsupported conversion: from '%s' to '%s'" % (family, outputExt))

    def _detectFamily(self, document):
        """Map the services supported by the document to a FAMILY_* constant."""
        if document.supportsService("com.sun.star.text.GenericTextDocument"):
            # NOTE: a GenericTextDocument is either a TextDocument, a WebDocument, or a GlobalDocument
            # but this further distinction doesn't seem to matter for conversions
            return FAMILY_TEXT
        if document.supportsService("com.sun.star.sheet.SpreadsheetDocument"):
            return FAMILY_SPREADSHEET
        if document.supportsService("com.sun.star.presentation.PresentationDocument"):
            return FAMILY_PRESENTATION
        if document.supportsService("com.sun.star.drawing.DrawingDocument"):
            return FAMILY_DRAWING
        raise DocumentConversionException("unknown document family: %s" % document)

    def _getFileExt(self, path):
        """Return the lower-cased extension of path without the leading dot.

        Returns an empty string when path has no extension; splitext always
        returns a string, so no None check is needed.
        """
        return splitext(path)[1][1:].lower()

    def _toFileUrl(self, path):
        """Return the file URL for the absolute form of path."""
        return uno.systemPathToFileUrl(abspath(path))

    def _toProperties(self, **args):
        """Turn keyword arguments into a tuple of UNO PropertyValue objects."""
        props = []
        for key in args:
            prop = PropertyValue()
            prop.Name = key
            prop.Value = args[key]
            props.append(prop)
        return tuple(props)
130
+
131
+
132
# Command-line entry point: python DocumentConverter.py <input-file> <output-file>
#
# Exit status: 255 on bad usage, 1 when the input file is missing or the
# conversion fails. The parenthesised print calls and "except ... as" work on
# Python 2.6+ as well as Python 3.
if __name__ == "__main__":
    from sys import argv, exit

    if len(argv) < 3:
        print("USAGE: python %s <input-file> <output-file>" % argv[0])
        exit(255)
    if not isfile(argv[1]):
        print("no such input file: %s" % argv[1])
        exit(1)

    try:
        converter = DocumentConverter()
        converter.convert(argv[1], argv[2])
    except DocumentConversionException as exception:
        # Space added after "ERROR!" so both error lines share one format.
        print("ERROR! " + str(exception))
        exit(1)
    except ErrorCodeIOException as exception:
        print("ERROR! ErrorCodeIOException %d" % exception.ErrCode)
        exit(1)
151
+
@@ -0,0 +1,63 @@
1
require 'rubygems'
require 'erb'
require 'fileutils'
require 'tempfile'
require 'tmpdir'
require 'zip/zip'
require 'open_office/server'
7
+
8
# Merges data into ERB-templated OpenDocument files and converts documents
# through OpenOffice::Server.
module Documentalist
  # Renders an ERB template string.
  #
  # str     - ERB source.
  # options - Hash; when :locals is a Hash, each key/value pair is exposed to
  #           the template as an instance variable (@key).
  #
  # Returns the rendered String.
  def self.merge(str, options = {})
    locals = options[:locals]

    # Render against a throwaway object rather than this module, so values
    # set for one render are not visible to (or overwritten by) another.
    context = Object.new
    if locals.is_a?(Hash)
      locals.each { |name, value| context.instance_variable_set("@#{name}", value) }
    end

    ERB.new(str).result(context.instance_eval { binding })
  end

  # Reads content.xml out of an OpenDocument (zip) file and restores the ERB
  # tag delimiters that are stored XML-escaped ("&lt;%" and "%&gt;").
  #
  # Returns the content as a String.
  def self.get_contents(odt_file)
    contents = ""
    Zip::ZipFile.open(odt_file) { |zip| contents = zip.read("content.xml") }
    contents.gsub("&lt;%", "<%").gsub("%&gt;", "%>")
  end

  # Merges data into an OpenDocument template.
  #
  # template - Path of the template file.
  # options  - Hash:
  #            :locals - values exposed to the template (see merge).
  #            :to     - destination path (String). With the same extension
  #                      as the template the merged copy is moved there;
  #                      otherwise it is converted via OpenOffice::Server.
  #                      When omitted, the template itself is replaced.
  #
  # Temporary files are removed even when merging or converting raises.
  def self.merge_template(template, options = {})
    merged_contents = Tempfile.new("officer-contents")
    tmp_merged_template = File.join(Dir.tmpdir, "merged-template-#{rand(10**9)}#{File.extname(template)}")

    begin
      # Render content.xml with the supplied locals
      merged_contents.write(merge(get_contents(template), :locals => options[:locals]))
      merged_contents.close

      # Copy the template so we can merge the data into the copy
      FileUtils.cp(template, tmp_merged_template)

      # Stuff the merged content.xml into the OpenDocument zip
      Zip::ZipFile.open(tmp_merged_template) do |zip|
        zip.replace("content.xml", merged_contents.path)
        zip.commit
      end

      if options[:to]
        if File.extname(options[:to]) == File.extname(template)
          FileUtils.mv(tmp_merged_template, options[:to])
        else
          # Server.convert takes an options hash; a bare path would fail on
          # its options[:to] lookup.
          OpenOffice::Server.convert(tmp_merged_template, :to => options[:to])
        end
      else
        # mv overwrites the destination, so the template is never removed
        # before its replacement is in place.
        FileUtils.mv(tmp_merged_template, template)
      end
    ensure
      merged_contents.close!
      # No-op when the merged copy has already been moved to its destination.
      FileUtils.rm_f(tmp_merged_template)
    end
  end

  # Converts the file at +from+ to +to+ (a destination path String or a
  # format Symbol) by delegating to OpenOffice::Server.convert.
  def self.convert(from, to)
    OpenOffice::Server.convert(from, :to => to)
  end
end
@@ -0,0 +1,148 @@
1
+ require 'timeout'
2
+ require 'tmpdir'
3
+
4
module OpenOffice
  # Starts, monitors and drives a headless OpenOffice instance, and converts
  # documents by running the bundled DocumentConverter.py script against it.
  #
  # All methods are module-level and public: a bare `private` has no effect
  # on `def self.` methods, so none is used here.
  module Server
    # Path to the Python executable
    PYTHON_PATH = "/usr/bin/python"

    # Server can convert from the following file formats
    CONVERT_FROM = [:odt, :doc, :rtf, :docx, :txt, :html, :htm, :wps]

    # To the following formats
    CONVERT_TO = [:odt, :doc, :rtf, :pdf, :txt, :html, :htm, :wps]

    # Python conversion script path
    PY_OD_CONVERTER = File.join(File.dirname(__FILE__), "../DocumentConverter.py")

    # Maximum allowed CPU usage for an OpenOffice process
    MAX_CPU = 80

    # Server start grace time
    SERVER_START_DELAY = 4

    # Log file (empty when not running inside Rails)
    LOG_FILE = Object.const_defined?(:RAILS_ROOT) ? File.join(RAILS_ROOT, "log", "openoffice.log") : ""

    # Converts the file at +origin+.
    #
    # origin  - Path of the source document; must exist.
    # options - Hash; :to is either a Symbol (target extension, the file is
    #           written next to the source) or a String (destination path).
    #
    # Returns the destination path, or nil when no :to option is given.
    # Raises RuntimeError when origin is missing or :to has another type, and
    # Timeout::Error when the conversion does not finish in time.
    def self.convert(origin, options = {:to => :txt})
      return unless options && options[:to]

      raise "#{origin} does not exist !" unless File.exist?(origin)
      ensure_available

      destination = destination_for(origin, options[:to])

      timeout(10, :attempts => 2) do
        # Argument-list form: no shell is involved, so paths containing
        # spaces or shell metacharacters are passed through untouched.
        system(PYTHON_PATH, PY_OD_CONVERTER, origin, destination, :out => "/dev/null", :err => "/dev/null")

        # HACK : sometimes text files get saved in ISO-8859-1 instead of regular UTF-8, so we force
        # a conversion if it's the case
        reencode_to_utf8(destination) if destination =~ /\.txt$/ && file_type(destination) =~ /ISO/
      end

      destination
    end

    # Works out where the converted file goes.
    #
    # A Symbol replaces the extension of +origin+ (or is appended when origin
    # has none); dots in directory names are left alone. A String is used as
    # the destination path as-is.
    def self.destination_for(origin, to)
      case to
      when Symbol then "#{origin.sub(/\.[^.\/]*\z/, "")}.#{to}"
      when String then to
      else raise "Can't convert #{origin} to #{to}"
      end
    end

    # Description of the file as printed by file(1); -b leaves the file name
    # out so a name containing "ISO" cannot cause a false match.
    def self.file_type(path)
      IO.popen(["file", "-b", path]) { |io| io.read }
    end

    # Rewrites +path+ from ISO-8859-1 to UTF-8 through a temporary file.
    def self.reencode_to_utf8(path)
      temp_file = File.join(Dir.tmpdir, "tmp_iconv_#{rand(10**9)}.txt")

      if system("iconv", "--from-code", "ISO-8859-1", "--to-code", "UTF-8", path, :out => temp_file)
        system("mv", temp_file, path)
      else
        # Leave the original file untouched and drop the partial output.
        File.delete(temp_file) if File.exist?(temp_file)
      end
    end

    # Is OpenOffice server running?
    def self.running?
      !`pgrep office`.empty?
    end

    # Restart if running or start new instance
    def self.restart!
      kill! if running?
      start!
    end

    # Start new instance
    def self.start!
      raise "Already running!" if running?
      system("/usr/bin/soffice -headless -accept=\"socket,host=127.0.0.1,port=8100;urp;\" -nofirststartwizard -nologo -nocrashreport -norestore -nolockcheck -nodefault #{">>" unless LOG_FILE.empty?} #{LOG_FILE} 2>&1 &")

      begin
        timeout(2) do
          until running?
            print "."
            # Short pause so the poll does not spawn pgrep in a tight loop.
            sleep(0.1)
          end
        end
      rescue StandardError
        raise "Could not start OpenOffice"
      end

      # OpenOffice needs some time to wake up
      sleep(SERVER_START_DELAY)

      nil
    end

    # Kill running instance
    def self.kill!
      raise "Not running!" unless running?

      begin
        timeout(3, :attempts => 2) do
          while running?
            system("pkill -9 office")
            sleep(0.1)
          end
        end
      rescue Timeout::Error
        raise "Mayday, mayday ! Could not kill OpenOffice !!"
      ensure
        # Remove user profile
        system("rm -rf ~/openoffice.org*")
      end
    end

    # Is the current instance stuck ?
    #
    # Returns nil when no instance is running, otherwise whether any office
    # process uses more than MAX_CPU percent CPU.
    def self.stalled?
      return unless running?

      over_cpu_limit?(`ps -Ao pcpu,pid,cmd | grep office`, pids)
    end

    # Does any line of +ps_output+ ("pcpu pid cmd" columns) belong to one of
    # +office_pids+ and report an integer CPU percentage above MAX_CPU?
    #
    # The PID column is compared exactly, so PID 12 does not match a line for
    # PID 3120 or a CPU figure of 12.0.
    def self.over_cpu_limit?(ps_output, office_pids)
      ps_output.split(/\n/).any? do |line|
        columns = /\A\s*(\d+)(?:[.,]\d+)?\s+(\d+)\s/.match(line)
        columns && office_pids.include?(columns[2].to_i) && columns[1].to_i > MAX_CPU
      end
    end

    # Make sure there will be an available instance
    def self.ensure_available
      start! unless running?
      restart! if stalled?
    end

    # Get OO processes pids (empty Array when none is running)
    def self.pids
      `pgrep office`.split.map { |pid| pid.to_i }
    end

    # Run a block with a timeout and retry if the first execution fails
    #
    # max_time - Seconds allowed for each attempt.
    # options  - Hash; :attempts is the total number of tries (default 1),
    #            :sleep an optional pause in seconds between tries.
    #
    # Returns the block's value, or nil when no block is given.
    # Raises Timeout::Error once every attempt has timed out.
    def self.timeout(max_time = 0, options = {:attempts => 1, :sleep => nil})
      return unless block_given?

      attempts = options[:attempts] || 1
      begin
        Timeout.timeout(max_time) { yield }
      rescue Timeout::Error
        attempts -= 1
        # Strictly-positive check keeps the retry bounded even for :attempts => 0.
        raise unless attempts > 0

        sleep(options[:sleep]) if options[:sleep]
        retry
      end
    end

    # Can a file named +origin+ be converted into +destination+?
    #
    # Both arguments are paths whose extension is looked up (case
    # insensitively) in CONVERT_FROM / CONVERT_TO; +destination+ may also be
    # a format Symbol such as :pdf.
    def self.convertible?(origin, destination)
      target = destination.is_a?(Symbol) ? destination : extension_of(destination)
      CONVERT_FROM.include?(extension_of(origin)) && CONVERT_TO.include?(target)
    end

    # Extension of +path+ as a lower-case Symbol without the dot, or nil when
    # the path has no extension.
    def self.extension_of(path)
      extension = File.extname(path.to_s).sub(/\A\./, "").downcase
      extension.empty? ? nil : extension.to_sym
    end
  end
end
@@ -0,0 +1 @@
1
+ require 'documentalist'
@@ -0,0 +1,40 @@
1
+ require 'test/unit'
2
+ require 'documentalist'
3
+
4
# Tests for ERB merging and for reading/merging OpenDocument templates.
class DocumentalistTest < Test::Unit::TestCase
  # Template shipped with the test suite.
  ODT_FIXTURE = File.join(File.dirname(__FILE__), "fixtures/fixture.odt")

  def test_merge
    template = "<%= @var1 %><%= 1.upto(3).map{ |n| n.to_s }.join %><%= @var2 %>"

    merged = Documentalist.merge(template, :locals => {
        :var1 => "test",
        :var2 => "working?"
      }
    )

    assert_equal "test123working?", merged, "Merge wasn't performed correctly"
  end

  def test_read_zipped_odt
    contents = Documentalist.get_contents(ODT_FIXTURE)

    assert_match(/Hello/, contents)
    assert_match(/thing/, contents)
    # The XML-escaped ERB delimiters must have been restored
    assert_no_match(/%&gt;/, contents)
    assert_no_match(/&lt;%=/, contents)
  end

  def test_odt_merge
    result = File.join(File.dirname(__FILE__), "fixtures/result.odt")

    begin
      Documentalist.merge_template(ODT_FIXTURE,
        :locals => {:thing => "world"},
        :to => result
      )

      # assert_match actually inspects the merged contents; a bare
      # `assert /world/, ...` only checks that the regexp literal is truthy.
      assert_match(/world/, Documentalist.get_contents(result))
    ensure
      # Remove the generated file even when the merge or assertion fails
      File.delete(result) if File.exist?(result)
    end
  end
end
Binary file
@@ -0,0 +1,21 @@
1
+ require 'test/unit'
2
+ require 'documentalist'
3
+ require 'fileutils'
4
+ require 'tmpdir'
5
+
6
# End-to-end check that the ODT fixture is converted to a PDF file.
class OpenOfficeTest < Test::Unit::TestCase
  def test_open_office
    destination = File.join(Dir.tmpdir, "fixture#{rand(10**9)}.pdf")

    begin
      Documentalist.convert(
        File.join(File.dirname(__FILE__), "fixtures", "fixture.odt"),
        destination
      )

      assert File.exist?(destination)

      FileUtils.rm(destination)

      assert !File.exist?(destination)
    ensure
      # Do not leave the generated PDF behind when the conversion or an
      # assertion fails; a no-op once the file has been removed above.
      FileUtils.rm_f(destination)
    end
  end
end
metadata ADDED
@@ -0,0 +1,103 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: documentalist
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - David FRANCOIS
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-06-08 00:00:00 +02:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: zip
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 11
30
+ segments:
31
+ - 2
32
+ - 0
33
+ - 2
34
+ version: 2.0.2
35
+ type: :runtime
36
+ version_requirements: *id001
37
+ description: Ruby interface that talks to OpenOffice
38
+ email: david.francois@webflows.fr
39
+ executables: []
40
+
41
+ extensions: []
42
+
43
+ extra_rdoc_files:
44
+ - README.rdoc
45
+ - lib/DocumentConverter.py
46
+ - lib/documentalist.rb
47
+ - lib/open_office/server.rb
48
+ files:
49
+ - README.rdoc
50
+ - Rakefile
51
+ - documentalist.gemspec
52
+ - init.rb
53
+ - lib/DocumentConverter.py
54
+ - lib/documentalist.rb
55
+ - lib/open_office/server.rb
56
+ - rails/init.rb
57
+ - test/documentalist_test.rb
58
+ - test/fixtures/fixture.odt
59
+ - test/open_office_test.rb
60
+ - Manifest
61
+ has_rdoc: true
62
+ homepage: http://github.com/davout/documentalist
63
+ licenses: []
64
+
65
+ post_install_message:
66
+ rdoc_options:
67
+ - --line-numbers
68
+ - --inline-source
69
+ - --title
70
+ - Documentalist
71
+ - --main
72
+ - README.rdoc
73
+ require_paths:
74
+ - lib
75
+ required_ruby_version: !ruby/object:Gem::Requirement
76
+ none: false
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ hash: 3
81
+ segments:
82
+ - 0
83
+ version: "0"
84
+ required_rubygems_version: !ruby/object:Gem::Requirement
85
+ none: false
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ hash: 11
90
+ segments:
91
+ - 1
92
+ - 2
93
+ version: "1.2"
94
+ requirements: []
95
+
96
+ rubyforge_project: documentalist
97
+ rubygems_version: 1.3.7
98
+ signing_key:
99
+ specification_version: 3
100
+ summary: Ruby interface that talks to OpenOffice
101
+ test_files:
102
+ - test/documentalist_test.rb
103
+ - test/open_office_test.rb