chupa-text 1.1.3 → 1.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/doc/text/news.md +12 -0
- data/lib/chupa-text/command/chupa-text.rb +7 -1
- data/lib/chupa-text/decomposer.rb +8 -0
- data/lib/chupa-text/decomposers/office-open-xml-document.rb +51 -0
- data/lib/chupa-text/decomposers/office-open-xml-presentation.rb +67 -0
- data/lib/chupa-text/decomposers/office-open-xml-workbook.rb +114 -0
- data/lib/chupa-text/decomposers/office-open-xml.rb +196 -0
- data/lib/chupa-text/decomposers/opendocument-presentation.rb +105 -0
- data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb +134 -0
- data/lib/chupa-text/decomposers/opendocument-text.rb +89 -0
- data/lib/chupa-text/decomposers/opendocument.rb +139 -0
- data/lib/chupa-text/extractor.rb +8 -2
- data/lib/chupa-text/formatters/mime.rb +3 -2
- data/lib/chupa-text/version.rb +1 -1
- data/test/decomposers/test-office-open-xml-document.rb +144 -0
- data/test/decomposers/test-office-open-xml-presentation.rb +133 -0
- data/test/decomposers/test-office-open-xml-workbook.rb +138 -0
- data/test/decomposers/test-open-document-presentation.rb +136 -0
- data/test/decomposers/test-open-document-spreadsheet.rb +152 -0
- data/test/decomposers/test-open-document-text.rb +144 -0
- data/test/fixture/docx/attributes.docx +0 -0
- data/test/fixture/docx/multi-pages.docx +0 -0
- data/test/fixture/docx/one-page.docx +0 -0
- data/test/fixture/docx/special-characters.docx +0 -0
- data/test/fixture/odp/attributes.odp +0 -0
- data/test/fixture/odp/multi-slides.odp +0 -0
- data/test/fixture/odp/one-slide.odp +0 -0
- data/test/fixture/ods/attributes.ods +0 -0
- data/test/fixture/ods/multi-sheets.ods +0 -0
- data/test/fixture/ods/one-sheet.ods +0 -0
- data/test/fixture/odt/attributes.odt +0 -0
- data/test/fixture/odt/multi-pages.odt +0 -0
- data/test/fixture/odt/one-page.odt +0 -0
- data/test/fixture/odt/special-characters.odt +0 -0
- data/test/fixture/pptx/attributes.pptx +0 -0
- data/test/fixture/pptx/multi-slides.pptx +0 -0
- data/test/fixture/pptx/one-slide.pptx +0 -0
- data/test/fixture/xlsx/attributes.xlsx +0 -0
- data/test/fixture/xlsx/multi-sheets.xlsx +0 -0
- data/test/fixture/xlsx/one-sheet.xlsx +0 -0
- metadata +36 -2
data/lib/chupa-text/extractor.rb
CHANGED
@@ -99,9 +99,15 @@ module ChupaText
|
|
99
99
|
end
|
100
100
|
|
101
101
|
def find_decomposer(data)
|
102
|
-
|
103
|
-
|
102
|
+
candidates = []
|
103
|
+
@decomposers.each do |decomposer|
|
104
|
+
score = decomposer.target_score(data)
|
105
|
+
next if score.nil?
|
106
|
+
candidates << [score, decomposer]
|
104
107
|
end
|
108
|
+
return nil if candidates.empty?
|
109
|
+
candidate = candidates.sort_by {|score, _| score}.first
|
110
|
+
candidate[1]
|
105
111
|
end
|
106
112
|
|
107
113
|
def log_tag
|
data/lib/chupa-text/formatters/mime.rb
CHANGED
@@ -22,9 +22,10 @@ require "chupa-text/formatters/hash"
|
|
22
22
|
module ChupaText
|
23
23
|
module Formatters
|
24
24
|
class MIME < Hash
|
25
|
-
def initialize(output)
|
25
|
+
def initialize(output, options={})
|
26
26
|
super()
|
27
27
|
@output = output
|
28
|
+
@boundary = options[:boundary]
|
28
29
|
end
|
29
30
|
|
30
31
|
def format_finish(data)
|
@@ -33,7 +34,7 @@ module ChupaText
|
|
33
34
|
@output << "MIME-Version: 1.0\r\n"
|
34
35
|
format_hash(formatted, ["texts"])
|
35
36
|
texts = formatted["texts"]
|
36
|
-
boundary = Digest::SHA1.hexdigest(data.uri.to_s)
|
37
|
+
boundary = @boundary || Digest::SHA1.hexdigest(data.uri.to_s)
|
37
38
|
@output << "Content-Type: multipart/mixed; boundary=#{boundary}\r\n"
|
38
39
|
texts.each do |text|
|
39
40
|
@output << "\r\n--#{boundary}\r\n"
|
data/lib/chupa-text/version.rb
CHANGED
data/test/decomposers/test-office-open-xml-document.rb
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
# Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
class TestDecomposersOfficeOpenXMLDocument < Test::Unit::TestCase
|
18
|
+
include Helper
|
19
|
+
|
20
|
+
def setup
|
21
|
+
@decomposer = ChupaText::Decomposers::OfficeOpenXMLDocument.new({})
|
22
|
+
end
|
23
|
+
|
24
|
+
def decompose(path)
|
25
|
+
data = ChupaText::InputData.new(path)
|
26
|
+
decomposed = []
|
27
|
+
@decomposer.decompose(data) do |decomposed_data|
|
28
|
+
decomposed << decomposed_data
|
29
|
+
end
|
30
|
+
decomposed
|
31
|
+
end
|
32
|
+
|
33
|
+
sub_test_case("#target_score") do
|
34
|
+
def test_extension
|
35
|
+
data = ChupaText::Data.new
|
36
|
+
data.body = ""
|
37
|
+
data.uri = "document.docx"
|
38
|
+
assert_equal(-1, @decomposer.target_score(data))
|
39
|
+
end
|
40
|
+
|
41
|
+
def test_mime_type
|
42
|
+
data = ChupaText::Data.new
|
43
|
+
data.mime_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
44
|
+
assert_equal(-1, @decomposer.target_score(data))
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
sub_test_case("#decompose") do
|
49
|
+
sub_test_case("attributes") do
|
50
|
+
def decompose(attribute_name)
|
51
|
+
super(fixture_path("docx", "attributes.docx")).collect do |data|
|
52
|
+
data[attribute_name]
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
def test_title
|
57
|
+
assert_equal(["Title"], decompose("title"))
|
58
|
+
end
|
59
|
+
|
60
|
+
def test_author
|
61
|
+
assert_equal([nil], decompose("author"))
|
62
|
+
end
|
63
|
+
|
64
|
+
def test_subject
|
65
|
+
assert_equal(["Subject"], decompose("subject"))
|
66
|
+
end
|
67
|
+
|
68
|
+
def test_keywords
|
69
|
+
assert_equal(["Keyword1 Keyword2"], decompose("keywords"))
|
70
|
+
end
|
71
|
+
|
72
|
+
def test_created_time
|
73
|
+
assert_equal([Time],
|
74
|
+
decompose("created_time").collect(&:class))
|
75
|
+
end
|
76
|
+
|
77
|
+
def test_modified_time
|
78
|
+
assert_equal([Time],
|
79
|
+
decompose("modified_time").collect(&:class))
|
80
|
+
end
|
81
|
+
|
82
|
+
def test_application
|
83
|
+
assert_equal(["LibreOffice"],
|
84
|
+
normalize_applications(decompose("application")))
|
85
|
+
end
|
86
|
+
|
87
|
+
def normalize_applications(applications)
|
88
|
+
applications.collect do |application|
|
89
|
+
normalize_application(application)
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
def normalize_application(application)
|
94
|
+
if application.start_with?("LibreOffice")
|
95
|
+
"LibreOffice"
|
96
|
+
else
|
97
|
+
application
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
def test_creation_date
|
102
|
+
assert_equal([nil], decompose("creation_date"))
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
sub_test_case("one page") do
|
107
|
+
def decompose
|
108
|
+
super(fixture_path("docx", "one-page.docx"))
|
109
|
+
end
|
110
|
+
|
111
|
+
def test_body
|
112
|
+
assert_equal(["Page1\n"], decompose.collect(&:body))
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
sub_test_case("multi pages") do
|
117
|
+
def decompose
|
118
|
+
super(fixture_path("docx", "multi-pages.docx"))
|
119
|
+
end
|
120
|
+
|
121
|
+
def test_body
|
122
|
+
assert_equal([<<-BODY], decompose.collect(&:body))
|
123
|
+
Page1
|
124
|
+
Page2
|
125
|
+
BODY
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
sub_test_case("special characters") do
|
130
|
+
def decompose
|
131
|
+
super(fixture_path("docx", "special-characters.docx"))
|
132
|
+
end
|
133
|
+
|
134
|
+
def test_body
|
135
|
+
assert_equal([<<-BODY], decompose.collect(&:body))
|
136
|
+
Ampersand: &
|
137
|
+
Reference: &amp;
|
138
|
+
HTML: <a href="">
|
139
|
+
Single quote: ''
|
140
|
+
BODY
|
141
|
+
end
|
142
|
+
end
|
143
|
+
end
|
144
|
+
end
|
data/test/decomposers/test-office-open-xml-presentation.rb
ADDED
@@ -0,0 +1,133 @@
|
|
1
|
+
# Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
class TestDecomposersOfficeOpenXMLPresentation < Test::Unit::TestCase
|
18
|
+
include Helper
|
19
|
+
|
20
|
+
def setup
|
21
|
+
@decomposer = ChupaText::Decomposers::OfficeOpenXMLPresentation.new({})
|
22
|
+
end
|
23
|
+
|
24
|
+
def decompose(path)
|
25
|
+
data = ChupaText::InputData.new(path)
|
26
|
+
decomposed = []
|
27
|
+
@decomposer.decompose(data) do |decomposed_data|
|
28
|
+
decomposed << decomposed_data
|
29
|
+
end
|
30
|
+
decomposed
|
31
|
+
end
|
32
|
+
|
33
|
+
sub_test_case("#target_score") do
|
34
|
+
def test_extension
|
35
|
+
data = ChupaText::Data.new
|
36
|
+
data.body = ""
|
37
|
+
data.uri = "presentation.pptx"
|
38
|
+
assert_equal(-1, @decomposer.target_score(data))
|
39
|
+
end
|
40
|
+
|
41
|
+
def test_mime_type
|
42
|
+
data = ChupaText::Data.new
|
43
|
+
data.mime_type = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
44
|
+
assert_equal(-1, @decomposer.target_score(data))
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
sub_test_case("#decompose") do
|
49
|
+
sub_test_case("attributes") do
|
50
|
+
def decompose(attribute_name)
|
51
|
+
super(fixture_path("pptx", "attributes.pptx")).collect do |data|
|
52
|
+
data[attribute_name]
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
def test_title
|
57
|
+
assert_equal(["Title"], decompose("title"))
|
58
|
+
end
|
59
|
+
|
60
|
+
def test_author
|
61
|
+
assert_equal([nil], decompose("author"))
|
62
|
+
end
|
63
|
+
|
64
|
+
def test_subject
|
65
|
+
assert_equal(["Subject"], decompose("subject"))
|
66
|
+
end
|
67
|
+
|
68
|
+
def test_keywords
|
69
|
+
assert_equal(["Keyword1 Keyword2"], decompose("keywords"))
|
70
|
+
end
|
71
|
+
|
72
|
+
def test_modified_time
|
73
|
+
assert_equal([Time],
|
74
|
+
decompose("modified_time").collect(&:class))
|
75
|
+
end
|
76
|
+
|
77
|
+
def test_application
|
78
|
+
assert_equal(["LibreOffice"],
|
79
|
+
normalize_applications(decompose("application")))
|
80
|
+
end
|
81
|
+
|
82
|
+
def normalize_applications(applications)
|
83
|
+
applications.collect do |application|
|
84
|
+
normalize_application(application)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
def normalize_application(application)
|
89
|
+
if application.start_with?("LibreOffice")
|
90
|
+
"LibreOffice"
|
91
|
+
else
|
92
|
+
application
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
def test_creation_date
|
97
|
+
assert_equal([nil], decompose("creation_date"))
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
sub_test_case("one slide") do
|
102
|
+
def decompose
|
103
|
+
super(fixture_path("pptx", "one-slide.pptx"))
|
104
|
+
end
|
105
|
+
|
106
|
+
def test_body
|
107
|
+
assert_equal([<<-BODY], decompose.collect(&:body))
|
108
|
+
Slide1 title
|
109
|
+
Slide1 content
|
110
|
+
BODY
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
sub_test_case("multi slides") do
|
115
|
+
def decompose
|
116
|
+
super(fixture_path("pptx", "multi-slides.pptx"))
|
117
|
+
end
|
118
|
+
|
119
|
+
def test_body
|
120
|
+
assert_equal([<<-BODY], decompose.collect(&:body))
|
121
|
+
Slide1 title
|
122
|
+
Slide1 content
|
123
|
+
|
124
|
+
Slide2 title
|
125
|
+
Slide2 content
|
126
|
+
|
127
|
+
Slide3 title
|
128
|
+
Slide3 content
|
129
|
+
BODY
|
130
|
+
end
|
131
|
+
end
|
132
|
+
end
|
133
|
+
end
|
data/test/decomposers/test-office-open-xml-workbook.rb
ADDED
@@ -0,0 +1,138 @@
|
|
1
|
+
# Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
class TestDecomposersOfficeOpenXMLWorkbook < Test::Unit::TestCase
|
18
|
+
include Helper
|
19
|
+
|
20
|
+
def setup
|
21
|
+
@decomposer = ChupaText::Decomposers::OfficeOpenXMLWorkbook.new({})
|
22
|
+
end
|
23
|
+
|
24
|
+
def decompose(path)
|
25
|
+
data = ChupaText::InputData.new(path)
|
26
|
+
decomposed = []
|
27
|
+
@decomposer.decompose(data) do |decomposed_data|
|
28
|
+
decomposed << decomposed_data
|
29
|
+
end
|
30
|
+
decomposed
|
31
|
+
end
|
32
|
+
|
33
|
+
sub_test_case("#target_score") do
|
34
|
+
def test_extension
|
35
|
+
data = ChupaText::Data.new
|
36
|
+
data.body = ""
|
37
|
+
data.uri = "workbook.xlsx"
|
38
|
+
assert_equal(-1, @decomposer.target_score(data))
|
39
|
+
end
|
40
|
+
|
41
|
+
def test_mime_type
|
42
|
+
data = ChupaText::Data.new
|
43
|
+
data.mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
44
|
+
assert_equal(-1, @decomposer.target_score(data))
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
sub_test_case("#decompose") do
|
49
|
+
sub_test_case("attributes") do
|
50
|
+
def decompose(attribute_name)
|
51
|
+
super(fixture_path("xlsx", "attributes.xlsx")).collect do |data|
|
52
|
+
data[attribute_name]
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
def test_title
|
57
|
+
assert_equal(["Title"], decompose("title"))
|
58
|
+
end
|
59
|
+
|
60
|
+
def test_author
|
61
|
+
assert_equal([nil], decompose("author"))
|
62
|
+
end
|
63
|
+
|
64
|
+
def test_subject
|
65
|
+
assert_equal(["Subject"], decompose("subject"))
|
66
|
+
end
|
67
|
+
|
68
|
+
def test_keywords
|
69
|
+
assert_equal(["Keyword1 Keyword2"], decompose("keywords"))
|
70
|
+
end
|
71
|
+
|
72
|
+
def test_created_time
|
73
|
+
assert_equal([Time],
|
74
|
+
decompose("created_time").collect(&:class))
|
75
|
+
end
|
76
|
+
|
77
|
+
def test_modified_time
|
78
|
+
assert_equal([Time],
|
79
|
+
decompose("modified_time").collect(&:class))
|
80
|
+
end
|
81
|
+
|
82
|
+
def test_application
|
83
|
+
assert_equal(["LibreOffice"],
|
84
|
+
normalize_applications(decompose("application")))
|
85
|
+
end
|
86
|
+
|
87
|
+
def normalize_applications(applications)
|
88
|
+
applications.collect do |application|
|
89
|
+
normalize_application(application)
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
def normalize_application(application)
|
94
|
+
if application.start_with?("LibreOffice")
|
95
|
+
"LibreOffice"
|
96
|
+
else
|
97
|
+
application
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
def test_creation_date
|
102
|
+
assert_equal([nil], decompose("creation_date"))
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
sub_test_case("one sheet") do
|
107
|
+
def decompose
|
108
|
+
super(fixture_path("xlsx", "one-sheet.xlsx"))
|
109
|
+
end
|
110
|
+
|
111
|
+
def test_body
|
112
|
+
assert_equal([<<-BODY], decompose.collect(&:body))
|
113
|
+
Sheet1 - A1\tSheet1 - B1
|
114
|
+
Sheet1 - A2\tSheet1 - B2
|
115
|
+
BODY
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
sub_test_case("multi sheets") do
|
120
|
+
def decompose
|
121
|
+
super(fixture_path("xlsx", "multi-sheets.xlsx"))
|
122
|
+
end
|
123
|
+
|
124
|
+
def test_body
|
125
|
+
assert_equal([<<-BODY], decompose.collect(&:body))
|
126
|
+
Sheet1 - A1\tSheet1 - B1
|
127
|
+
Sheet1 - A2\tSheet1 - B2
|
128
|
+
|
129
|
+
Sheet2 - A1\tSheet2 - B1
|
130
|
+
Sheet2 - A2\tSheet2 - B2
|
131
|
+
|
132
|
+
Sheet3 - A1\tSheet3 - B1
|
133
|
+
Sheet3 - A2\tSheet3 - B2
|
134
|
+
BODY
|
135
|
+
end
|
136
|
+
end
|
137
|
+
end
|
138
|
+
end
|