chupa-text 1.1.3 → 1.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42)
  1. checksums.yaml +4 -4
  2. data/doc/text/news.md +12 -0
  3. data/lib/chupa-text/command/chupa-text.rb +7 -1
  4. data/lib/chupa-text/decomposer.rb +8 -0
  5. data/lib/chupa-text/decomposers/office-open-xml-document.rb +51 -0
  6. data/lib/chupa-text/decomposers/office-open-xml-presentation.rb +67 -0
  7. data/lib/chupa-text/decomposers/office-open-xml-workbook.rb +114 -0
  8. data/lib/chupa-text/decomposers/office-open-xml.rb +196 -0
  9. data/lib/chupa-text/decomposers/opendocument-presentation.rb +105 -0
  10. data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb +134 -0
  11. data/lib/chupa-text/decomposers/opendocument-text.rb +89 -0
  12. data/lib/chupa-text/decomposers/opendocument.rb +139 -0
  13. data/lib/chupa-text/extractor.rb +8 -2
  14. data/lib/chupa-text/formatters/mime.rb +3 -2
  15. data/lib/chupa-text/version.rb +1 -1
  16. data/test/decomposers/test-office-open-xml-document.rb +144 -0
  17. data/test/decomposers/test-office-open-xml-presentation.rb +133 -0
  18. data/test/decomposers/test-office-open-xml-workbook.rb +138 -0
  19. data/test/decomposers/test-open-document-presentation.rb +136 -0
  20. data/test/decomposers/test-open-document-spreadsheet.rb +152 -0
  21. data/test/decomposers/test-open-document-text.rb +144 -0
  22. data/test/fixture/docx/attributes.docx +0 -0
  23. data/test/fixture/docx/multi-pages.docx +0 -0
  24. data/test/fixture/docx/one-page.docx +0 -0
  25. data/test/fixture/docx/special-characters.docx +0 -0
  26. data/test/fixture/odp/attributes.odp +0 -0
  27. data/test/fixture/odp/multi-slides.odp +0 -0
  28. data/test/fixture/odp/one-slide.odp +0 -0
  29. data/test/fixture/ods/attributes.ods +0 -0
  30. data/test/fixture/ods/multi-sheets.ods +0 -0
  31. data/test/fixture/ods/one-sheet.ods +0 -0
  32. data/test/fixture/odt/attributes.odt +0 -0
  33. data/test/fixture/odt/multi-pages.odt +0 -0
  34. data/test/fixture/odt/one-page.odt +0 -0
  35. data/test/fixture/odt/special-characters.odt +0 -0
  36. data/test/fixture/pptx/attributes.pptx +0 -0
  37. data/test/fixture/pptx/multi-slides.pptx +0 -0
  38. data/test/fixture/pptx/one-slide.pptx +0 -0
  39. data/test/fixture/xlsx/attributes.xlsx +0 -0
  40. data/test/fixture/xlsx/multi-sheets.xlsx +0 -0
  41. data/test/fixture/xlsx/one-sheet.xlsx +0 -0
  42. metadata +36 -2
@@ -99,9 +99,15 @@ module ChupaText
99
99
  end
100
100
 
101
101
  def find_decomposer(data)
102
- @decomposers.find do |decomposer|
103
- decomposer.target?(data)
102
+ candidates = []
103
+ @decomposers.each do |decomposer|
104
+ score = decomposer.target_score(data)
105
+ next if score.nil?
106
+ candidates << [score, decomposer]
104
107
  end
108
+ return nil if candidates.empty?
109
+ candidate = candidates.sort_by {|score, _| score}.first
110
+ candidate[1]
105
111
  end
106
112
 
107
113
  def log_tag
@@ -22,9 +22,10 @@ require "chupa-text/formatters/hash"
22
22
  module ChupaText
23
23
  module Formatters
24
24
  class MIME < Hash
25
- def initialize(output)
25
+ def initialize(output, options={})
26
26
  super()
27
27
  @output = output
28
+ @boundary = options[:boundary]
28
29
  end
29
30
 
30
31
  def format_finish(data)
@@ -33,7 +34,7 @@ module ChupaText
33
34
  @output << "MIME-Version: 1.0\r\n"
34
35
  format_hash(formatted, ["texts"])
35
36
  texts = formatted["texts"]
36
- boundary = Digest::SHA1.hexdigest(data.uri.to_s)
37
+ boundary = @boundary || Digest::SHA1.hexdigest(data.uri.to_s)
37
38
  @output << "Content-Type: multipart/mixed; boundary=#{boundary}\r\n"
38
39
  texts.each do |text|
39
40
  @output << "\r\n--#{boundary}\r\n"
@@ -15,5 +15,5 @@
15
15
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
16
 
17
17
  module ChupaText
18
- VERSION = "1.1.3"
18
+ VERSION = "1.1.4"
19
19
  end
@@ -0,0 +1,144 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ class TestDecomposersOfficeOpenXMLDocument < Test::Unit::TestCase
18
+ include Helper
19
+
20
+ def setup
21
+ @decomposer = ChupaText::Decomposers::OfficeOpenXMLDocument.new({})
22
+ end
23
+
24
+ def decompose(path)
25
+ data = ChupaText::InputData.new(path)
26
+ decomposed = []
27
+ @decomposer.decompose(data) do |decomposed_data|
28
+ decomposed << decomposed_data
29
+ end
30
+ decomposed
31
+ end
32
+
33
+ sub_test_case("#target_score") do
34
+ def test_extension
35
+ data = ChupaText::Data.new
36
+ data.body = ""
37
+ data.uri = "document.docx"
38
+ assert_equal(-1, @decomposer.target_score(data))
39
+ end
40
+
41
+ def test_mime_type
42
+ data = ChupaText::Data.new
43
+ data.mime_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
44
+ assert_equal(-1, @decomposer.target_score(data))
45
+ end
46
+ end
47
+
48
+ sub_test_case("#decompose") do
49
+ sub_test_case("attributes") do
50
+ def decompose(attribute_name)
51
+ super(fixture_path("docx", "attributes.docx")).collect do |data|
52
+ data[attribute_name]
53
+ end
54
+ end
55
+
56
+ def test_title
57
+ assert_equal(["Title"], decompose("title"))
58
+ end
59
+
60
+ def test_author
61
+ assert_equal([nil], decompose("author"))
62
+ end
63
+
64
+ def test_subject
65
+ assert_equal(["Subject"], decompose("subject"))
66
+ end
67
+
68
+ def test_keywords
69
+ assert_equal(["Keyword1 Keyword2"], decompose("keywords"))
70
+ end
71
+
72
+ def test_created_time
73
+ assert_equal([Time],
74
+ decompose("created_time").collect(&:class))
75
+ end
76
+
77
+ def test_modified_time
78
+ assert_equal([Time],
79
+ decompose("modified_time").collect(&:class))
80
+ end
81
+
82
+ def test_application
83
+ assert_equal(["LibreOffice"],
84
+ normalize_applications(decompose("application")))
85
+ end
86
+
87
+ def normalize_applications(applications)
88
+ applications.collect do |application|
89
+ normalize_application(application)
90
+ end
91
+ end
92
+
93
+ def normalize_application(application)
94
+ if application.start_with?("LibreOffice")
95
+ "LibreOffice"
96
+ else
97
+ application
98
+ end
99
+ end
100
+
101
+ def test_creation_date
102
+ assert_equal([nil], decompose("creation_date"))
103
+ end
104
+ end
105
+
106
+ sub_test_case("one page") do
107
+ def decompose
108
+ super(fixture_path("docx", "one-page.docx"))
109
+ end
110
+
111
+ def test_body
112
+ assert_equal(["Page1\n"], decompose.collect(&:body))
113
+ end
114
+ end
115
+
116
+ sub_test_case("multi pages") do
117
+ def decompose
118
+ super(fixture_path("docx", "multi-pages.docx"))
119
+ end
120
+
121
+ def test_body
122
+ assert_equal([<<-BODY], decompose.collect(&:body))
123
+ Page1
124
+ Page2
125
+ BODY
126
+ end
127
+ end
128
+
129
+ sub_test_case("special characters") do
130
+ def decompose
131
+ super(fixture_path("docx", "special-characters.docx"))
132
+ end
133
+
134
+ def test_body
135
+ assert_equal([<<-BODY], decompose.collect(&:body))
136
+ Ampersand: &
137
+ Reference: &amp;
138
+ HTML: <a href="">
139
+ Single quote: ''
140
+ BODY
141
+ end
142
+ end
143
+ end
144
+ end
@@ -0,0 +1,133 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ class TestDecomposersOfficeOpenXMLPresentation < Test::Unit::TestCase
18
+ include Helper
19
+
20
+ def setup
21
+ @decomposer = ChupaText::Decomposers::OfficeOpenXMLPresentation.new({})
22
+ end
23
+
24
+ def decompose(path)
25
+ data = ChupaText::InputData.new(path)
26
+ decomposed = []
27
+ @decomposer.decompose(data) do |decomposed_data|
28
+ decomposed << decomposed_data
29
+ end
30
+ decomposed
31
+ end
32
+
33
+ sub_test_case("#target_score") do
34
+ def test_extension
35
+ data = ChupaText::Data.new
36
+ data.body = ""
37
+ data.uri = "presentation.pptx"
38
+ assert_equal(-1, @decomposer.target_score(data))
39
+ end
40
+
41
+ def test_mime_type
42
+ data = ChupaText::Data.new
43
+ data.mime_type = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
44
+ assert_equal(-1, @decomposer.target_score(data))
45
+ end
46
+ end
47
+
48
+ sub_test_case("#decompose") do
49
+ sub_test_case("attributes") do
50
+ def decompose(attribute_name)
51
+ super(fixture_path("pptx", "attributes.pptx")).collect do |data|
52
+ data[attribute_name]
53
+ end
54
+ end
55
+
56
+ def test_title
57
+ assert_equal(["Title"], decompose("title"))
58
+ end
59
+
60
+ def test_author
61
+ assert_equal([nil], decompose("author"))
62
+ end
63
+
64
+ def test_subject
65
+ assert_equal(["Subject"], decompose("subject"))
66
+ end
67
+
68
+ def test_keywords
69
+ assert_equal(["Keyword1 Keyword2"], decompose("keywords"))
70
+ end
71
+
72
+ def test_modified_time
73
+ assert_equal([Time],
74
+ decompose("modified_time").collect(&:class))
75
+ end
76
+
77
+ def test_application
78
+ assert_equal(["LibreOffice"],
79
+ normalize_applications(decompose("application")))
80
+ end
81
+
82
+ def normalize_applications(applications)
83
+ applications.collect do |application|
84
+ normalize_application(application)
85
+ end
86
+ end
87
+
88
+ def normalize_application(application)
89
+ if application.start_with?("LibreOffice")
90
+ "LibreOffice"
91
+ else
92
+ application
93
+ end
94
+ end
95
+
96
+ def test_creation_date
97
+ assert_equal([nil], decompose("creation_date"))
98
+ end
99
+ end
100
+
101
+ sub_test_case("one slide") do
102
+ def decompose
103
+ super(fixture_path("pptx", "one-slide.pptx"))
104
+ end
105
+
106
+ def test_body
107
+ assert_equal([<<-BODY], decompose.collect(&:body))
108
+ Slide1 title
109
+ Slide1 content
110
+ BODY
111
+ end
112
+ end
113
+
114
+ sub_test_case("multi slides") do
115
+ def decompose
116
+ super(fixture_path("pptx", "multi-slides.pptx"))
117
+ end
118
+
119
+ def test_body
120
+ assert_equal([<<-BODY], decompose.collect(&:body))
121
+ Slide1 title
122
+ Slide1 content
123
+
124
+ Slide2 title
125
+ Slide2 content
126
+
127
+ Slide3 title
128
+ Slide3 content
129
+ BODY
130
+ end
131
+ end
132
+ end
133
+ end
@@ -0,0 +1,138 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ class TestDecomposersOfficeOpenXMLWorkbook < Test::Unit::TestCase
18
+ include Helper
19
+
20
+ def setup
21
+ @decomposer = ChupaText::Decomposers::OfficeOpenXMLWorkbook.new({})
22
+ end
23
+
24
+ def decompose(path)
25
+ data = ChupaText::InputData.new(path)
26
+ decomposed = []
27
+ @decomposer.decompose(data) do |decomposed_data|
28
+ decomposed << decomposed_data
29
+ end
30
+ decomposed
31
+ end
32
+
33
+ sub_test_case("#target_score") do
34
+ def test_extension
35
+ data = ChupaText::Data.new
36
+ data.body = ""
37
+ data.uri = "workbook.xlsx"
38
+ assert_equal(-1, @decomposer.target_score(data))
39
+ end
40
+
41
+ def test_mime_type
42
+ data = ChupaText::Data.new
43
+ data.mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
44
+ assert_equal(-1, @decomposer.target_score(data))
45
+ end
46
+ end
47
+
48
+ sub_test_case("#decompose") do
49
+ sub_test_case("attributes") do
50
+ def decompose(attribute_name)
51
+ super(fixture_path("xlsx", "attributes.xlsx")).collect do |data|
52
+ data[attribute_name]
53
+ end
54
+ end
55
+
56
+ def test_title
57
+ assert_equal(["Title"], decompose("title"))
58
+ end
59
+
60
+ def test_author
61
+ assert_equal([nil], decompose("author"))
62
+ end
63
+
64
+ def test_subject
65
+ assert_equal(["Subject"], decompose("subject"))
66
+ end
67
+
68
+ def test_keywords
69
+ assert_equal(["Keyword1 Keyword2"], decompose("keywords"))
70
+ end
71
+
72
+ def test_created_time
73
+ assert_equal([Time],
74
+ decompose("created_time").collect(&:class))
75
+ end
76
+
77
+ def test_modified_time
78
+ assert_equal([Time],
79
+ decompose("modified_time").collect(&:class))
80
+ end
81
+
82
+ def test_application
83
+ assert_equal(["LibreOffice"],
84
+ normalize_applications(decompose("application")))
85
+ end
86
+
87
+ def normalize_applications(applications)
88
+ applications.collect do |application|
89
+ normalize_application(application)
90
+ end
91
+ end
92
+
93
+ def normalize_application(application)
94
+ if application.start_with?("LibreOffice")
95
+ "LibreOffice"
96
+ else
97
+ application
98
+ end
99
+ end
100
+
101
+ def test_creation_date
102
+ assert_equal([nil], decompose("creation_date"))
103
+ end
104
+ end
105
+
106
+ sub_test_case("one sheet") do
107
+ def decompose
108
+ super(fixture_path("xlsx", "one-sheet.xlsx"))
109
+ end
110
+
111
+ def test_body
112
+ assert_equal([<<-BODY], decompose.collect(&:body))
113
+ Sheet1 - A1\tSheet1 - B1
114
+ Sheet1 - A2\tSheet1 - B2
115
+ BODY
116
+ end
117
+ end
118
+
119
+ sub_test_case("multi sheets") do
120
+ def decompose
121
+ super(fixture_path("xlsx", "multi-sheets.xlsx"))
122
+ end
123
+
124
+ def test_body
125
+ assert_equal([<<-BODY], decompose.collect(&:body))
126
+ Sheet1 - A1\tSheet1 - B1
127
+ Sheet1 - A2\tSheet1 - B2
128
+
129
+ Sheet2 - A1\tSheet2 - B1
130
+ Sheet2 - A2\tSheet2 - B2
131
+
132
+ Sheet3 - A1\tSheet3 - B1
133
+ Sheet3 - A2\tSheet3 - B2
134
+ BODY
135
+ end
136
+ end
137
+ end
138
+ end