chupa-text-decomposer-libreoffice 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +5 -0
  3. data/Gemfile +34 -0
  4. data/LICENSE.txt +502 -0
  5. data/README.md +45 -0
  6. data/Rakefile +46 -0
  7. data/chupa-text-decomposer-libreoffice.gemspec +51 -0
  8. data/doc/text/news.md +5 -0
  9. data/lib/chupa-text/decomposers/libreoffice.rb +131 -0
  10. data/test/fixture/doc/attributes.doc +0 -0
  11. data/test/fixture/doc/multi-pages.doc +0 -0
  12. data/test/fixture/doc/one-page.doc +0 -0
  13. data/test/fixture/docx/attributes.docx +0 -0
  14. data/test/fixture/docx/multi-pages.docx +0 -0
  15. data/test/fixture/docx/one-page.docx +0 -0
  16. data/test/fixture/odp/attributes.odp +0 -0
  17. data/test/fixture/odp/multi-slides.odp +0 -0
  18. data/test/fixture/odp/one-slide.odp +0 -0
  19. data/test/fixture/ods/attributes.ods +0 -0
  20. data/test/fixture/ods/multi-sheets.ods +0 -0
  21. data/test/fixture/ods/one-sheet.ods +0 -0
  22. data/test/fixture/odt/attributes.odt +0 -0
  23. data/test/fixture/odt/multi-pages.odt +0 -0
  24. data/test/fixture/odt/one-page.odt +0 -0
  25. data/test/fixture/ppt/attributes.ppt +0 -0
  26. data/test/fixture/ppt/multi-slides.ppt +0 -0
  27. data/test/fixture/ppt/one-slide.ppt +0 -0
  28. data/test/fixture/pptx/attributes.pptx +0 -0
  29. data/test/fixture/pptx/multi-slides.pptx +0 -0
  30. data/test/fixture/pptx/one-slide.pptx +0 -0
  31. data/test/fixture/xls/attributes.xls +0 -0
  32. data/test/fixture/xls/multi-sheets.xls +0 -0
  33. data/test/fixture/xls/one-sheet.xls +0 -0
  34. data/test/fixture/xlsx/attributes.xlsx +0 -0
  35. data/test/fixture/xlsx/multi-sheets.xlsx +0 -0
  36. data/test/fixture/xlsx/one-sheet.xlsx +0 -0
  37. data/test/helper.rb +45 -0
  38. data/test/run-test.rb +31 -0
  39. data/test/test-doc.rb +123 -0
  40. data/test/test-docx.rb +123 -0
  41. data/test/test-odp.rb +133 -0
  42. data/test/test-ods.rb +138 -0
  43. data/test/test-odt.rb +123 -0
  44. data/test/test-ppt.rb +133 -0
  45. data/test/test-pptx.rb +136 -0
  46. data/test/test-xls.rb +138 -0
  47. data/test/test-xlsx.rb +138 -0
  48. metadata +187 -0
data/README.md ADDED
@@ -0,0 +1,45 @@
1
+ # README
2
+
3
+ ## Name
4
+
5
+ chupa-text-decomposer-libreoffice
6
+
7
+ ## Description
8
+
9
+ This is a ChupaText decomposer plugin to extract text and
10
+ meta-data from office files such as Microsoft Word files, Microsoft
11
+ Excel files and OpenDocument Format files. It uses
12
+ [LibreOffice](https://www.libreoffice.org/).
13
+
14
+ You can use the `libreoffice` decomposer.
15
+
16
+ It depends on the `pdf` decomposer because it converts an office file to
17
+ a PDF file and extracts text and meta-data with the `pdf` decomposer.
18
+
19
+ ## Install
20
+
21
+ Install chupa-text-decomposer-libreoffice gem:
22
+
23
+ ```
24
+ % gem install chupa-text-decomposer-libreoffice
25
+ ```
26
+
27
+ Install
28
+ [LibreOffice from download page](http://www.libreoffice.org/download).
29
+
30
+ Now, you can extract text and meta-data from office files:
31
+
32
+ ```
33
+ % chupa-text document.doc
34
+ ```
35
+
36
+ ## Author
37
+
38
+ * Kouhei Sutou `<kou@clear-code.com>`
39
+
40
+ ## License
41
+
42
+ LGPL 2.1 or later.
43
+
44
+ (Kouhei Sutou has a right to change the license including contributed
45
+ patches.)
data/Rakefile ADDED
@@ -0,0 +1,46 @@
1
+ # -*- mode: ruby; coding: utf-8 -*-
2
+ #
3
+ # Copyright (C) 2014 Kouhei Sutou <kou@clear-code.com>
4
+ #
5
+ # This library is free software; you can redistribute it and/or
6
+ # modify it under the terms of the GNU Lesser General Public
7
+ # License as published by the Free Software Foundation; either
8
+ # version 2.1 of the License, or (at your option) any later version.
9
+ #
10
+ # This library is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ # Lesser General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU Lesser General Public
16
+ # License along with this library; if not, write to the Free Software
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+
19
# Running plain `rake` runs the test task defined at the bottom.
task(:default => :test)

require "pathname"

require "rubygems"
require "bundler/gem_helper"
require "packnga"

# Directory that contains this Rakefile.
project_root = Pathname(__FILE__).dirname

gem_helper = Bundler::GemHelper.new(project_root.to_s)

# Override version_tag on this one helper instance so that it returns the
# bare version string.
class << gem_helper
  def version_tag
    version
  end
end

# Register the helper's Rake tasks, then reuse the gemspec it loaded.
gem_helper.install
gem_spec = gem_helper.gemspec

# Packnga documentation and release tasks, without extra configuration.
Packnga::DocumentTask.new(gem_spec) { }

Packnga::ReleaseTask.new(gem_spec) { }

desc("Run tests")
task(:test) do
  ruby("test/run-test.rb")
end
@@ -0,0 +1,51 @@
1
+ # -*- mode: ruby; coding: utf-8 -*-
2
+ #
3
+ # Copyright (C) 2014 Kouhei Sutou <kou@clear-code.com>
4
+ #
5
+ # This library is free software; you can redistribute it and/or
6
+ # modify it under the terms of the GNU Lesser General Public
7
+ # License as published by the Free Software Foundation; either
8
+ # version 2.1 of the License, or (at your option) any later version.
9
+ #
10
+ # This library is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ # Lesser General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU Lesser General Public
16
+ # License along with this library; if not, write to the Free Software
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+
19
# Removes leading and trailing blank lines from a README section and
# terminates the result with a single newline.
clean_white_space = lambda do |entry|
  entry.gsub(/(\A\n+|\n+\z)/, '') + "\n"
end

Gem::Specification.new do |spec|
  spec.name = "chupa-text-decomposer-libreoffice"
  spec.version = "1.0.0"
  spec.homepage = "https://github.com/ranguba/chupa-text-decomposer-libreoffice"
  spec.authors = ["Kouhei Sutou"]
  spec.email = ["kou@clear-code.com"]

  # The summary and description are taken from README.md so they are
  # written in only one place. The path is relative to the current
  # directory.
  readme = File.read("README.md", :encoding => "UTF-8")
  # Splitting on "## <heading>" with a capture group keeps the headings in
  # the result: [preamble, heading, body, heading, body, ...]. The element
  # right after "Description" is therefore that section's body.
  entries = readme.split(/^\#\#\s(.*)$/)
  description = clean_white_space.call(entries[entries.index("Description") + 1])
  # The first paragraph of the description doubles as the summary.
  spec.summary = description.split(/\n\n+/, 2).first
  spec.description = description
  # SPDX identifier for "LGPL 2.1 or later"; RubyGems expects SPDX license
  # identifiers and warns about free-form license strings.
  spec.license = "LGPL-2.1-or-later"
  spec.files = ["#{spec.name}.gemspec"]
  spec.files += ["README.md", "LICENSE.txt", "Rakefile", "Gemfile"]
  spec.files += [".yardopts"]
  spec.files += Dir.glob("lib/**/*.rb")
  spec.files += Dir.glob("doc/text/*")
  spec.files += Dir.glob("test/**/*")

  # External (non-gem) requirement: the LibreOffice executable.
  spec.requirements << "LibreOffice"

  # The PDF produced by LibreOffice is processed by the pdf decomposer.
  spec.add_runtime_dependency("chupa-text-decomposer-pdf")

  spec.add_development_dependency("bundler")
  spec.add_development_dependency("rake")
  spec.add_development_dependency("test-unit")
  spec.add_development_dependency("packnga")
  spec.add_development_dependency("redcarpet")
end
data/doc/text/news.md ADDED
@@ -0,0 +1,5 @@
1
+ # News
2
+
3
+ ## 1.0.0: 2014-02-16
4
+
5
+ The first release!!!
@@ -0,0 +1,131 @@
1
+ # Copyright (C) 2014 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "tempfile"
18
+ require "tmpdir"
19
+
20
+ module ChupaText
21
+ module Decomposers
22
# Decomposer that converts office documents to PDF by running LibreOffice
# in headless mode. The resulting PDF is yielded as new data; this class
# does not extract text from it itself.
class LibreOffice < Decomposer
  registry.register("libreoffice", self)

  # @param options [Hash] decomposer options. The :libreoffice entry, when
  #   present, is tried first as the LibreOffice command (it is read from
  #   @options, which the superclass is expected to set -- TODO confirm).
  def initialize(options)
    super
    @command = find_command
  end

  # File extensions this decomposer accepts.
  TARGET_EXTENSIONS = [
    "odt",
    "ods",
    "odp",
    "doc",
    "xls",
    "ppt",
    "docx",
    "xlsx",
    "pptx",
  ].freeze

  # MIME types this decomposer accepts.
  TARGET_MIME_TYPES = [
    "application/vnd.oasis.opendocument.text",
    "application/vnd.oasis.opendocument.presentation",
    "application/vnd.oasis.opendocument.spreadsheet",
    "application/msword",
    "application/vnd.ms-excel",
    "application/vnd.ms-powerpoint",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
  ].freeze

  # @param data [#extension, #mime_type] data to check.
  # @return [Boolean] true when a LibreOffice command was found and the
  #   data's extension or MIME type is one of the supported ones.
  def target?(data)
    return false if @command.nil?
    TARGET_EXTENSIONS.include?(data.extension) ||
      TARGET_MIME_TYPES.include?(data.mime_type)
  end

  # Converts the data to PDF and yields the PDF data. Nothing is yielded
  # when the conversion fails (the failure is logged).
  #
  # @param data [#path] data to convert.
  # @yieldparam pdf_data the converted PDF.
  def decompose(data)
    pdf_data = convert_to_pdf(data)
    return if pdf_data.nil?
    yield(pdf_data)
  end

  private
  # Returns the first usable LibreOffice command, or nil when none exists.
  # Candidates are tried in order: the :libreoffice option, the LIBREOFFICE
  # environment variable, then "libreoffice" and "soffice". Each candidate
  # is tried as given first and then under /opt/libreoffice*/program/.
  def find_command
    candidates = [
      @options[:libreoffice],
      ENV["LIBREOFFICE"],
      "libreoffice",
      "soffice",
    ]
    candidates.each do |candidate|
      next if candidate.nil?
      command = ExternalCommand.new(candidate)
      return command if command.exist?
      expanded_candidate = expand_candidate(candidate)
      next if expanded_candidate.nil?
      command = ExternalCommand.new(expanded_candidate)
      return command if command.exist?
    end
    nil
  end

  # Returns the first path matching /opt/libreoffice*/program/<candidate>,
  # or nil when nothing matches.
  def expand_candidate(candidate)
    Dir.glob("/opt/libreoffice*/program/#{candidate}").first
  end

  # Runs LibreOffice to convert data.path to PDF inside a temporary
  # directory.
  #
  # @return the PDF wrapped in ChupaText::VirtualFileData, or nil when
  #   LibreOffice reports failure or produces no PDF file.
  def convert_to_pdf(data)
    Dir.mktmpdir do |temporary_directory|
      output = nil
      error = nil
      begin
        # LibreOffice's stdout and stderr are captured in files so they
        # can be included in the log message on failure.
        output = Tempfile.new("chupa-text-decomposer-libreoffice-output")
        error = Tempfile.new("chupa-text-decomposer-libreoffice-error")
        succeeded = @command.run("--headless",
                                 "--nologo",
                                 "--convert-to", "pdf",
                                 "--outdir", temporary_directory,
                                 data.path,
                                 {
                                   :spawn_options => {
                                     :out => output.path,
                                     :err => error.path,
                                   },
                                 })
        unless succeeded
          tag = "[decomposer][libreoffice][convert][exited][abnormally]"
          log_conversion_error(tag, output, error)
          return nil
        end
        pdf_path = Dir.glob("#{temporary_directory}/*.pdf").first
        if pdf_path.nil?
          # The command reported success but wrote no PDF.
          tag = "[decomposer][libreoffice][convert][failed]"
          log_conversion_error("#{tag}: LibreOffice may be running",
                               output, error)
          return nil
        end
        File.open(pdf_path, "rb") do |pdf|
          ChupaText::VirtualFileData.new(pdf_path, pdf)
        end
      ensure
        # Close and delete the capture files on every exit path instead
        # of leaving them for the garbage collector.
        output.close! if output
        error.close! if error
      end
    end
  end

  # Logs a conversion failure: the headline followed by the captured
  # standard output and standard error of the LibreOffice process.
  def log_conversion_error(headline, output, error)
    message = [
      headline,
      "output: <#{output.read}>",
      "error: <#{error.read}>",
    ].join("\n")
    ChupaText.logger.error(message)
  end
end
130
+ end
131
+ end
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
data/test/helper.rb ADDED
@@ -0,0 +1,45 @@
1
+ # Copyright (C) 2014 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
# Test helpers for creating the decomposer under test and locating
# fixture files.
module FixtureHelper
  # Stores a LibreOffice decomposer built with empty options in
  # @decomposer.
  def setup_decomposer
    @decomposer = ChupaText::Decomposers::LibreOffice.new({})
  end

  # Returns the Pathname of components resolved below the "fixture"
  # directory that sits next to this file.
  def fixture_path(*components)
    Pathname(__FILE__).dirname.join("fixture", *components)
  end
end
27
+
28
# Test helper that runs @decomposer and then, for PDF results, the pdf
# decomposer.
module DecomposeHelper
  # Decomposes the file at path with @decomposer. Each yielded data that
  # the pdf decomposer targets is decomposed again with it; any other
  # data is collected as is.
  #
  # @return [Array] the collected data, in the order they were yielded.
  def decompose(path)
    input = ChupaText::InputData.new(path)

    pdf_stage = ChupaText::Decomposers::PDF.new({})
    results = []
    @decomposer.decompose(input) do |converted|
      unless pdf_stage.target?(converted)
        results << converted
        next
      end
      pdf_stage.decompose(converted) { |extracted| results << extracted }
    end
    results
  end
end
data/test/run-test.rb ADDED
@@ -0,0 +1,31 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
4
+ #
5
+ # This library is free software; you can redistribute it and/or
6
+ # modify it under the terms of the GNU Lesser General Public
7
+ # License as published by the Free Software Foundation; either
8
+ # version 2.1 of the License, or (at your option) any later version.
9
+ #
10
+ # This library is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ # Lesser General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU Lesser General Public
16
+ # License along with this library; if not, write to the Free Software
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+
19
# Show Ruby warnings while the tests run.
$VERBOSE = true

require "bundler/setup"

require "test-unit"

require "chupa-text"

# Load the decomposers before the helper and the tests reference them.
ChupaText::Decomposers.load

require_relative "helper"

# The runner's result becomes the exit status of this script.
succeeded = Test::Unit::AutoRunner.run(true)
exit(succeeded)
data/test/test-doc.rb ADDED
@@ -0,0 +1,123 @@
1
+ # Copyright (C) 2014 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "pathname"
18
+
19
# Tests the LibreOffice decomposer against the .doc fixtures.
class TestDoc < Test::Unit::TestCase
  include FixtureHelper

  def setup
    setup_decomposer
  end

  # Resolves fixture names inside the "doc" fixture directory.
  def fixture_path(*components)
    super("doc", *components)
  end

  sub_test_case("target?") do
    sub_test_case("extension") do
      # Builds data with an empty body and the given URI.
      def create_data(uri)
        ChupaText::Data.new.tap do |data|
          data.body = ""
          data.uri = uri
        end
      end

      def test_doc
        data = create_data("document.doc")
        assert_true(@decomposer.target?(data))
      end
    end

    sub_test_case("mime-type") do
      # Builds data that only has a MIME type.
      def create_data(mime_type)
        ChupaText::Data.new.tap do |data|
          data.mime_type = mime_type
        end
      end

      def test_ms_word
        data = create_data("application/msword")
        assert_true(@decomposer.target?(data))
      end
    end
  end

  sub_test_case("decompose") do
    include DecomposeHelper

    sub_test_case("attributes") do
      def test_title
        assert_equal(["Title"], decompose("title"))
      end

      def test_author
        assert_equal([nil], decompose("author"))
      end

      def test_subject
        assert_equal(["Subject"], decompose("subject"))
      end

      def test_keywords
        assert_equal(["Keyword1, Keyword2"], decompose("keywords"))
      end

      def test_creator
        assert_equal(["Writer"], decompose("creator"))
      end

      def test_producer
        assert_equal(["LibreOffice 4.1"], decompose("producer"))
      end

      def test_creation_date
        assert_equal([nil], decompose("creation_date"))
      end

      private
      # Returns the named attribute of every data decomposed from
      # attributes.doc.
      def decompose(attribute_name)
        decomposed = super(fixture_path("attributes.doc"))
        decomposed.map { |data| data[attribute_name] }
      end
    end

    sub_test_case("one page") do
      def test_body
        bodies = decompose.map(&:body)
        assert_equal(["Page1"], bodies)
      end

      private
      def decompose
        super(fixture_path("one-page.doc"))
      end
    end

    sub_test_case("multi pages") do
      def test_body
        bodies = decompose.map(&:body)
        assert_equal(["Page1\nPage2"], bodies)
      end

      private
      def decompose
        super(fixture_path("multi-pages.doc"))
      end
    end
  end
end