chupa-text 1.1.3 → 1.1.4

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +4 -4
  2. data/doc/text/news.md +12 -0
  3. data/lib/chupa-text/command/chupa-text.rb +7 -1
  4. data/lib/chupa-text/decomposer.rb +8 -0
  5. data/lib/chupa-text/decomposers/office-open-xml-document.rb +51 -0
  6. data/lib/chupa-text/decomposers/office-open-xml-presentation.rb +67 -0
  7. data/lib/chupa-text/decomposers/office-open-xml-workbook.rb +114 -0
  8. data/lib/chupa-text/decomposers/office-open-xml.rb +196 -0
  9. data/lib/chupa-text/decomposers/opendocument-presentation.rb +105 -0
  10. data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb +134 -0
  11. data/lib/chupa-text/decomposers/opendocument-text.rb +89 -0
  12. data/lib/chupa-text/decomposers/opendocument.rb +139 -0
  13. data/lib/chupa-text/extractor.rb +8 -2
  14. data/lib/chupa-text/formatters/mime.rb +3 -2
  15. data/lib/chupa-text/version.rb +1 -1
  16. data/test/decomposers/test-office-open-xml-document.rb +144 -0
  17. data/test/decomposers/test-office-open-xml-presentation.rb +133 -0
  18. data/test/decomposers/test-office-open-xml-workbook.rb +138 -0
  19. data/test/decomposers/test-open-document-presentation.rb +136 -0
  20. data/test/decomposers/test-open-document-spreadsheet.rb +152 -0
  21. data/test/decomposers/test-open-document-text.rb +144 -0
  22. data/test/fixture/docx/attributes.docx +0 -0
  23. data/test/fixture/docx/multi-pages.docx +0 -0
  24. data/test/fixture/docx/one-page.docx +0 -0
  25. data/test/fixture/docx/special-characters.docx +0 -0
  26. data/test/fixture/odp/attributes.odp +0 -0
  27. data/test/fixture/odp/multi-slides.odp +0 -0
  28. data/test/fixture/odp/one-slide.odp +0 -0
  29. data/test/fixture/ods/attributes.ods +0 -0
  30. data/test/fixture/ods/multi-sheets.ods +0 -0
  31. data/test/fixture/ods/one-sheet.ods +0 -0
  32. data/test/fixture/odt/attributes.odt +0 -0
  33. data/test/fixture/odt/multi-pages.odt +0 -0
  34. data/test/fixture/odt/one-page.odt +0 -0
  35. data/test/fixture/odt/special-characters.odt +0 -0
  36. data/test/fixture/pptx/attributes.pptx +0 -0
  37. data/test/fixture/pptx/multi-slides.pptx +0 -0
  38. data/test/fixture/pptx/one-slide.pptx +0 -0
  39. data/test/fixture/xlsx/attributes.xlsx +0 -0
  40. data/test/fixture/xlsx/multi-sheets.xlsx +0 -0
  41. data/test/fixture/xlsx/one-sheet.xlsx +0 -0
  42. metadata +36 -2
@@ -99,9 +99,15 @@ module ChupaText
99
99
  end
100
100
 
101
101
  def find_decomposer(data)
102
- @decomposers.find do |decomposer|
103
- decomposer.target?(data)
102
+ candidates = []
103
+ @decomposers.each do |decomposer|
104
+ score = decomposer.target_score(data)
105
+ next if score.nil?
106
+ candidates << [score, decomposer]
104
107
  end
108
+ return nil if candidates.empty?
109
+ candidate = candidates.sort_by {|score, _| score}.first
110
+ candidate[1]
105
111
  end
106
112
 
107
113
  def log_tag
@@ -22,9 +22,10 @@ require "chupa-text/formatters/hash"
22
22
  module ChupaText
23
23
  module Formatters
24
24
  class MIME < Hash
25
- def initialize(output)
25
+ def initialize(output, options={})
26
26
  super()
27
27
  @output = output
28
+ @boundary = options[:boundary]
28
29
  end
29
30
 
30
31
  def format_finish(data)
@@ -33,7 +34,7 @@ module ChupaText
33
34
  @output << "MIME-Version: 1.0\r\n"
34
35
  format_hash(formatted, ["texts"])
35
36
  texts = formatted["texts"]
36
- boundary = Digest::SHA1.hexdigest(data.uri.to_s)
37
+ boundary = @boundary || Digest::SHA1.hexdigest(data.uri.to_s)
37
38
  @output << "Content-Type: multipart/mixed; boundary=#{boundary}\r\n"
38
39
  texts.each do |text|
39
40
  @output << "\r\n--#{boundary}\r\n"
@@ -15,5 +15,5 @@
15
15
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
16
 
17
17
  module ChupaText
18
- VERSION = "1.1.3"
18
+ VERSION = "1.1.4"
19
19
  end
@@ -0,0 +1,144 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ class TestDecomposersOfficeOpenXMLDocument < Test::Unit::TestCase
18
+ include Helper
19
+
20
+ def setup
21
+ @decomposer = ChupaText::Decomposers::OfficeOpenXMLDocument.new({})
22
+ end
23
+
24
+ def decompose(path)
25
+ data = ChupaText::InputData.new(path)
26
+ decomposed = []
27
+ @decomposer.decompose(data) do |decomposed_data|
28
+ decomposed << decomposed_data
29
+ end
30
+ decomposed
31
+ end
32
+
33
+ sub_test_case("#target_score") do
34
+ def test_extension
35
+ data = ChupaText::Data.new
36
+ data.body = ""
37
+ data.uri = "document.docx"
38
+ assert_equal(-1, @decomposer.target_score(data))
39
+ end
40
+
41
+ def test_mime_type
42
+ data = ChupaText::Data.new
43
+ data.mime_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
44
+ assert_equal(-1, @decomposer.target_score(data))
45
+ end
46
+ end
47
+
48
+ sub_test_case("#decompose") do
49
+ sub_test_case("attributes") do
50
+ def decompose(attribute_name)
51
+ super(fixture_path("docx", "attributes.docx")).collect do |data|
52
+ data[attribute_name]
53
+ end
54
+ end
55
+
56
+ def test_title
57
+ assert_equal(["Title"], decompose("title"))
58
+ end
59
+
60
+ def test_author
61
+ assert_equal([nil], decompose("author"))
62
+ end
63
+
64
+ def test_subject
65
+ assert_equal(["Subject"], decompose("subject"))
66
+ end
67
+
68
+ def test_keywords
69
+ assert_equal(["Keyword1 Keyword2"], decompose("keywords"))
70
+ end
71
+
72
+ def test_created_time
73
+ assert_equal([Time],
74
+ decompose("created_time").collect(&:class))
75
+ end
76
+
77
+ def test_modified_time
78
+ assert_equal([Time],
79
+ decompose("modified_time").collect(&:class))
80
+ end
81
+
82
+ def test_application
83
+ assert_equal(["LibreOffice"],
84
+ normalize_applications(decompose("application")))
85
+ end
86
+
87
+ def normalize_applications(applications)
88
+ applications.collect do |application|
89
+ normalize_application(application)
90
+ end
91
+ end
92
+
93
+ def normalize_application(application)
94
+ if application.start_with?("LibreOffice")
95
+ "LibreOffice"
96
+ else
97
+ application
98
+ end
99
+ end
100
+
101
+ def test_creation_date
102
+ assert_equal([nil], decompose("creation_date"))
103
+ end
104
+ end
105
+
106
+ sub_test_case("one page") do
107
+ def decompose
108
+ super(fixture_path("docx", "one-page.docx"))
109
+ end
110
+
111
+ def test_body
112
+ assert_equal(["Page1\n"], decompose.collect(&:body))
113
+ end
114
+ end
115
+
116
+ sub_test_case("multi pages") do
117
+ def decompose
118
+ super(fixture_path("docx", "multi-pages.docx"))
119
+ end
120
+
121
+ def test_body
122
+ assert_equal([<<-BODY], decompose.collect(&:body))
123
+ Page1
124
+ Page2
125
+ BODY
126
+ end
127
+ end
128
+
129
+ sub_test_case("special characters") do
130
+ def decompose
131
+ super(fixture_path("docx", "special-characters.docx"))
132
+ end
133
+
134
+ def test_body
135
+ assert_equal([<<-BODY], decompose.collect(&:body))
136
+ Ampersand: &
137
+ Reference: &amp;
138
+ HTML: <a href="">
139
+ Single quote: ''
140
+ BODY
141
+ end
142
+ end
143
+ end
144
+ end
@@ -0,0 +1,133 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ class TestDecomposersOfficeOpenXMLPresentation < Test::Unit::TestCase
18
+ include Helper
19
+
20
+ def setup
21
+ @decomposer = ChupaText::Decomposers::OfficeOpenXMLPresentation.new({})
22
+ end
23
+
24
+ def decompose(path)
25
+ data = ChupaText::InputData.new(path)
26
+ decomposed = []
27
+ @decomposer.decompose(data) do |decomposed_data|
28
+ decomposed << decomposed_data
29
+ end
30
+ decomposed
31
+ end
32
+
33
+ sub_test_case("#target_score") do
34
+ def test_extension
35
+ data = ChupaText::Data.new
36
+ data.body = ""
37
+ data.uri = "presentation.pptx"
38
+ assert_equal(-1, @decomposer.target_score(data))
39
+ end
40
+
41
+ def test_mime_type
42
+ data = ChupaText::Data.new
43
+ data.mime_type = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
44
+ assert_equal(-1, @decomposer.target_score(data))
45
+ end
46
+ end
47
+
48
+ sub_test_case("#decompose") do
49
+ sub_test_case("attributes") do
50
+ def decompose(attribute_name)
51
+ super(fixture_path("pptx", "attributes.pptx")).collect do |data|
52
+ data[attribute_name]
53
+ end
54
+ end
55
+
56
+ def test_title
57
+ assert_equal(["Title"], decompose("title"))
58
+ end
59
+
60
+ def test_author
61
+ assert_equal([nil], decompose("author"))
62
+ end
63
+
64
+ def test_subject
65
+ assert_equal(["Subject"], decompose("subject"))
66
+ end
67
+
68
+ def test_keywords
69
+ assert_equal(["Keyword1 Keyword2"], decompose("keywords"))
70
+ end
71
+
72
+ def test_modified_time
73
+ assert_equal([Time],
74
+ decompose("modified_time").collect(&:class))
75
+ end
76
+
77
+ def test_application
78
+ assert_equal(["LibreOffice"],
79
+ normalize_applications(decompose("application")))
80
+ end
81
+
82
+ def normalize_applications(applications)
83
+ applications.collect do |application|
84
+ normalize_application(application)
85
+ end
86
+ end
87
+
88
+ def normalize_application(application)
89
+ if application.start_with?("LibreOffice")
90
+ "LibreOffice"
91
+ else
92
+ application
93
+ end
94
+ end
95
+
96
+ def test_creation_date
97
+ assert_equal([nil], decompose("creation_date"))
98
+ end
99
+ end
100
+
101
+ sub_test_case("one slide") do
102
+ def decompose
103
+ super(fixture_path("pptx", "one-slide.pptx"))
104
+ end
105
+
106
+ def test_body
107
+ assert_equal([<<-BODY], decompose.collect(&:body))
108
+ Slide1 title
109
+ Slide1 content
110
+ BODY
111
+ end
112
+ end
113
+
114
+ sub_test_case("multi slides") do
115
+ def decompose
116
+ super(fixture_path("pptx", "multi-slides.pptx"))
117
+ end
118
+
119
+ def test_body
120
+ assert_equal([<<-BODY], decompose.collect(&:body))
121
+ Slide1 title
122
+ Slide1 content
123
+
124
+ Slide2 title
125
+ Slide2 content
126
+
127
+ Slide3 title
128
+ Slide3 content
129
+ BODY
130
+ end
131
+ end
132
+ end
133
+ end
@@ -0,0 +1,138 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ class TestDecomposersOfficeOpenXMLWorkbook < Test::Unit::TestCase
18
+ include Helper
19
+
20
+ def setup
21
+ @decomposer = ChupaText::Decomposers::OfficeOpenXMLWorkbook.new({})
22
+ end
23
+
24
+ def decompose(path)
25
+ data = ChupaText::InputData.new(path)
26
+ decomposed = []
27
+ @decomposer.decompose(data) do |decomposed_data|
28
+ decomposed << decomposed_data
29
+ end
30
+ decomposed
31
+ end
32
+
33
+ sub_test_case("#target_score") do
34
+ def test_extension
35
+ data = ChupaText::Data.new
36
+ data.body = ""
37
+ data.uri = "workbook.xlsx"
38
+ assert_equal(-1, @decomposer.target_score(data))
39
+ end
40
+
41
+ def test_mime_type
42
+ data = ChupaText::Data.new
43
+ data.mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
44
+ assert_equal(-1, @decomposer.target_score(data))
45
+ end
46
+ end
47
+
48
+ sub_test_case("#decompose") do
49
+ sub_test_case("attributes") do
50
+ def decompose(attribute_name)
51
+ super(fixture_path("xlsx", "attributes.xlsx")).collect do |data|
52
+ data[attribute_name]
53
+ end
54
+ end
55
+
56
+ def test_title
57
+ assert_equal(["Title"], decompose("title"))
58
+ end
59
+
60
+ def test_author
61
+ assert_equal([nil], decompose("author"))
62
+ end
63
+
64
+ def test_subject
65
+ assert_equal(["Subject"], decompose("subject"))
66
+ end
67
+
68
+ def test_keywords
69
+ assert_equal(["Keyword1 Keyword2"], decompose("keywords"))
70
+ end
71
+
72
+ def test_created_time
73
+ assert_equal([Time],
74
+ decompose("created_time").collect(&:class))
75
+ end
76
+
77
+ def test_modified_time
78
+ assert_equal([Time],
79
+ decompose("modified_time").collect(&:class))
80
+ end
81
+
82
+ def test_application
83
+ assert_equal(["LibreOffice"],
84
+ normalize_applications(decompose("application")))
85
+ end
86
+
87
+ def normalize_applications(applications)
88
+ applications.collect do |application|
89
+ normalize_application(application)
90
+ end
91
+ end
92
+
93
+ def normalize_application(application)
94
+ if application.start_with?("LibreOffice")
95
+ "LibreOffice"
96
+ else
97
+ application
98
+ end
99
+ end
100
+
101
+ def test_creation_date
102
+ assert_equal([nil], decompose("creation_date"))
103
+ end
104
+ end
105
+
106
+ sub_test_case("one sheet") do
107
+ def decompose
108
+ super(fixture_path("xlsx", "one-sheet.xlsx"))
109
+ end
110
+
111
+ def test_body
112
+ assert_equal([<<-BODY], decompose.collect(&:body))
113
+ Sheet1 - A1\tSheet1 - B1
114
+ Sheet1 - A2\tSheet1 - B2
115
+ BODY
116
+ end
117
+ end
118
+
119
+ sub_test_case("multi sheets") do
120
+ def decompose
121
+ super(fixture_path("xlsx", "multi-sheets.xlsx"))
122
+ end
123
+
124
+ def test_body
125
+ assert_equal([<<-BODY], decompose.collect(&:body))
126
+ Sheet1 - A1\tSheet1 - B1
127
+ Sheet1 - A2\tSheet1 - B2
128
+
129
+ Sheet2 - A1\tSheet2 - B1
130
+ Sheet2 - A2\tSheet2 - B2
131
+
132
+ Sheet3 - A1\tSheet3 - B1
133
+ Sheet3 - A2\tSheet3 - B2
134
+ BODY
135
+ end
136
+ end
137
+ end
138
+ end