chupa-text 1.1.5 → 1.1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/chupa-text.gemspec +2 -1
- data/doc/text/news.md +42 -0
- data/lib/chupa-text/data.rb +19 -2
- data/lib/chupa-text/decomposers/csv.rb +20 -4
- data/lib/chupa-text/decomposers/office-open-xml-document.rb +12 -0
- data/lib/chupa-text/decomposers/office-open-xml-presentation.rb +17 -3
- data/lib/chupa-text/decomposers/office-open-xml-workbook.rb +83 -10
- data/lib/chupa-text/decomposers/office-open-xml.rb +4 -8
- data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb +26 -2
- data/lib/chupa-text/decomposers/tar.rb +18 -12
- data/lib/chupa-text/decomposers/zip.rb +30 -4
- data/lib/chupa-text/extractor.rb +5 -3
- data/lib/chupa-text/path-converter.rb +70 -0
- data/lib/chupa-text/utf8-converter.rb +117 -0
- data/lib/chupa-text/version.rb +1 -1
- data/test/command/test-chupa-text.rb +4 -4
- data/test/decomposers/test-csv.rb +18 -3
- data/test/decomposers/test-office-open-xml-presentation.rb +46 -46
- data/test/decomposers/test-office-open-xml-workbook.rb +64 -46
- data/test/decomposers/{test-open-document-presentation.rb → test-opendocument-presentation.rb} +0 -0
- data/test/decomposers/{test-open-document-spreadsheet.rb → test-opendocument-spreadsheet.rb} +35 -19
- data/test/decomposers/{test-open-document-text.rb → test-opendocument-text.rb} +0 -0
- data/test/decomposers/test-tar.rb +18 -1
- data/test/decomposers/test-zip.rb +31 -1
- data/test/fixture/ods/covered-table-cell.ods +0 -0
- data/test/fixture/ods/shapes.ods +0 -0
- data/test/fixture/tar/utf-8.tar +0 -0
- data/test/fixture/xlsx/not-shared-cell.xlsx +0 -0
- data/test/fixture/zip/cp932.zip +0 -0
- data/test/fixture/zip/utf-8.zip +0 -0
- data/test/helper.rb +31 -1
- data/test/test-data.rb +7 -3
- data/test/test-extractor.rb +108 -1
- metadata +29 -7
@@ -48,46 +48,36 @@ class TestDecomposersOfficeOpenXMLWorkbook < Test::Unit::TestCase
|
|
48
48
|
sub_test_case("#decompose") do
|
49
49
|
sub_test_case("attributes") do
|
50
50
|
def decompose(attribute_name)
|
51
|
-
super(fixture_path("xlsx", "attributes.xlsx")).
|
52
|
-
data[attribute_name]
|
53
|
-
end
|
51
|
+
super(fixture_path("xlsx", "attributes.xlsx")).first[attribute_name]
|
54
52
|
end
|
55
53
|
|
56
54
|
def test_title
|
57
|
-
assert_equal(
|
55
|
+
assert_equal("Title", decompose("title"))
|
58
56
|
end
|
59
57
|
|
60
58
|
def test_author
|
61
|
-
assert_equal(
|
59
|
+
assert_equal(nil, decompose("author"))
|
62
60
|
end
|
63
61
|
|
64
62
|
def test_subject
|
65
|
-
assert_equal(
|
63
|
+
assert_equal("Subject", decompose("subject"))
|
66
64
|
end
|
67
65
|
|
68
66
|
def test_keywords
|
69
|
-
assert_equal(
|
67
|
+
assert_equal("Keyword1 Keyword2", decompose("keywords"))
|
70
68
|
end
|
71
69
|
|
72
70
|
def test_created_time
|
73
|
-
assert_equal(
|
74
|
-
decompose("created_time").collect(&:class))
|
71
|
+
assert_equal(Time, decompose("created_time").class)
|
75
72
|
end
|
76
73
|
|
77
74
|
def test_modified_time
|
78
|
-
assert_equal(
|
79
|
-
decompose("modified_time").collect(&:class))
|
75
|
+
assert_equal(Time, decompose("modified_time").class)
|
80
76
|
end
|
81
77
|
|
82
78
|
def test_application
|
83
|
-
assert_equal(
|
84
|
-
|
85
|
-
end
|
86
|
-
|
87
|
-
def normalize_applications(applications)
|
88
|
-
applications.collect do |application|
|
89
|
-
normalize_application(application)
|
90
|
-
end
|
79
|
+
assert_equal("LibreOffice",
|
80
|
+
normalize_application(decompose("application")))
|
91
81
|
end
|
92
82
|
|
93
83
|
def normalize_application(application)
|
@@ -97,41 +87,69 @@ class TestDecomposersOfficeOpenXMLWorkbook < Test::Unit::TestCase
|
|
97
87
|
application
|
98
88
|
end
|
99
89
|
end
|
100
|
-
|
101
|
-
def test_creation_date
|
102
|
-
assert_equal([nil], decompose("creation_date"))
|
103
|
-
end
|
104
90
|
end
|
105
91
|
|
106
|
-
sub_test_case("
|
107
|
-
def decompose
|
108
|
-
super(
|
92
|
+
sub_test_case("sheets") do
|
93
|
+
def decompose(path)
|
94
|
+
super(path).collect do |data|
|
95
|
+
[
|
96
|
+
data["index"],
|
97
|
+
data["name"],
|
98
|
+
data.body,
|
99
|
+
]
|
100
|
+
end
|
109
101
|
end
|
110
102
|
|
111
|
-
def
|
112
|
-
assert_equal([
|
113
|
-
|
114
|
-
|
115
|
-
|
103
|
+
def test_one_sheet
|
104
|
+
assert_equal([
|
105
|
+
[nil, nil, ""],
|
106
|
+
[
|
107
|
+
0,
|
108
|
+
"Sheet1",
|
109
|
+
"Sheet1 - A1\tSheet1 - B1\n" +
|
110
|
+
"Sheet1 - A2\tSheet1 - B2\n",
|
111
|
+
],
|
112
|
+
],
|
113
|
+
decompose(fixture_path("xlsx", "one-sheet.xlsx")))
|
116
114
|
end
|
117
|
-
end
|
118
115
|
|
119
|
-
|
120
|
-
|
121
|
-
|
116
|
+
def test_no_shared_cell
|
117
|
+
assert_equal([
|
118
|
+
[nil, nil, ""],
|
119
|
+
[
|
120
|
+
0,
|
121
|
+
"Sheet1",
|
122
|
+
"Sheet1 - A1\tSheet1 - B1\n" +
|
123
|
+
"Sheet1 - A2\tSheet1 - B2\n" +
|
124
|
+
"0.5\t0.5\n",
|
125
|
+
],
|
126
|
+
],
|
127
|
+
decompose(fixture_path("xlsx", "not-shared-cell.xlsx")))
|
122
128
|
end
|
123
129
|
|
124
|
-
def
|
125
|
-
assert_equal([
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
130
|
+
def test_multi_sheets
|
131
|
+
assert_equal([
|
132
|
+
[nil, nil, ""],
|
133
|
+
[
|
134
|
+
0,
|
135
|
+
"Sheet1",
|
136
|
+
"Sheet1 - A1\tSheet1 - B1\n" +
|
137
|
+
"Sheet1 - A2\tSheet1 - B2\n",
|
138
|
+
],
|
139
|
+
[
|
140
|
+
1,
|
141
|
+
"Sheet2",
|
142
|
+
"Sheet2 - A1\tSheet2 - B1\n" +
|
143
|
+
"Sheet2 - A2\tSheet2 - B2\n",
|
144
|
+
],
|
145
|
+
[
|
146
|
+
2,
|
147
|
+
"Sheet3",
|
148
|
+
"Sheet3 - A1\tSheet3 - B1\n" +
|
149
|
+
"Sheet3 - A2\tSheet3 - B2\n",
|
150
|
+
],
|
151
|
+
],
|
152
|
+
decompose(fixture_path("xlsx", "multi-sheets.xlsx")))
|
135
153
|
end
|
136
154
|
end
|
137
155
|
end
|
data/test/decomposers/{test-open-document-presentation.rb → test-opendocument-presentation.rb}
RENAMED
File without changes
|
data/test/decomposers/{test-open-document-spreadsheet.rb → test-opendocument-spreadsheet.rb}
RENAMED
@@ -87,9 +87,9 @@ class TestDecomposersOpenDocumentSpreadsheet < Test::Unit::TestCase
|
|
87
87
|
end
|
88
88
|
end
|
89
89
|
|
90
|
-
sub_test_case("
|
91
|
-
def decompose
|
92
|
-
super(
|
90
|
+
sub_test_case("sheets") do
|
91
|
+
def decompose(path)
|
92
|
+
super(path).collect do |data|
|
93
93
|
[
|
94
94
|
data["index"],
|
95
95
|
data["name"],
|
@@ -98,7 +98,7 @@ class TestDecomposersOpenDocumentSpreadsheet < Test::Unit::TestCase
|
|
98
98
|
end
|
99
99
|
end
|
100
100
|
|
101
|
-
def
|
101
|
+
def test_one_sheet
|
102
102
|
assert_equal([
|
103
103
|
[nil, nil, ""],
|
104
104
|
[
|
@@ -108,22 +108,10 @@ class TestDecomposersOpenDocumentSpreadsheet < Test::Unit::TestCase
|
|
108
108
|
"Sheet1 - A2\tSheet1 - B2\n",
|
109
109
|
],
|
110
110
|
],
|
111
|
-
decompose)
|
112
|
-
end
|
113
|
-
end
|
114
|
-
|
115
|
-
sub_test_case("multi sheets") do
|
116
|
-
def decompose
|
117
|
-
super(fixture_path("ods", "multi-sheets.ods")).collect do |data|
|
118
|
-
[
|
119
|
-
data["index"],
|
120
|
-
data["name"],
|
121
|
-
data.body,
|
122
|
-
]
|
123
|
-
end
|
111
|
+
decompose(fixture_path("ods", "one-sheet.ods")))
|
124
112
|
end
|
125
113
|
|
126
|
-
def
|
114
|
+
def test_multi_sheets
|
127
115
|
assert_equal([
|
128
116
|
[nil, nil, ""],
|
129
117
|
[
|
@@ -145,7 +133,35 @@ class TestDecomposersOpenDocumentSpreadsheet < Test::Unit::TestCase
|
|
145
133
|
"Sheet3 - A2\tSheet3 - B2\n",
|
146
134
|
],
|
147
135
|
],
|
148
|
-
decompose)
|
136
|
+
decompose(fixture_path("ods", "multi-sheets.ods")))
|
137
|
+
end
|
138
|
+
|
139
|
+
def test_covered_table_cell
|
140
|
+
assert_equal([
|
141
|
+
[nil, nil, ""],
|
142
|
+
[
|
143
|
+
0,
|
144
|
+
"Sheet1",
|
145
|
+
"Covered-table-cell\t\n",
|
146
|
+
],
|
147
|
+
],
|
148
|
+
decompose(fixture_path("ods", "covered-table-cell.ods")))
|
149
|
+
end
|
150
|
+
|
151
|
+
def test_shapes
|
152
|
+
assert_equal([
|
153
|
+
[nil, nil, ""],
|
154
|
+
[
|
155
|
+
0,
|
156
|
+
"Sheet1",
|
157
|
+
"Shape1\n" +
|
158
|
+
"Shape1\n" +
|
159
|
+
"Shape2\n" +
|
160
|
+
"Shape2\n" +
|
161
|
+
"Cell\n",
|
162
|
+
],
|
163
|
+
],
|
164
|
+
decompose(fixture_path("ods", "shapes.ods")))
|
149
165
|
end
|
150
166
|
end
|
151
167
|
end
|
File without changes
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013-
|
1
|
+
# Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -70,5 +70,22 @@ class TestDecomposersTar < Test::Unit::TestCase
|
|
70
70
|
decompose(data))
|
71
71
|
end
|
72
72
|
end
|
73
|
+
|
74
|
+
sub_test_case("multibyte") do
|
75
|
+
test("UTF-8") do
|
76
|
+
data_path = Pathname.new(fixture_path("utf-8.tar"))
|
77
|
+
base_path = data_path.sub_ext("")
|
78
|
+
data = ChupaText::InputData.new(data_path)
|
79
|
+
path = CGI.escape("こんにちは.txt")
|
80
|
+
assert_equal([
|
81
|
+
{
|
82
|
+
:uri => file_uri("#{base_path}/utf-8/#{path}").to_s,
|
83
|
+
:body => "こんにちは\n".b,
|
84
|
+
:source => data.uri.to_s,
|
85
|
+
},
|
86
|
+
],
|
87
|
+
decompose(data))
|
88
|
+
end
|
89
|
+
end
|
73
90
|
end
|
74
91
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2017 Kouhei Sutou <kou@clear-code.com>
|
1
|
+
# Copyright (C) 2017-2019 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -63,6 +63,36 @@ class TestDecomposersZip < Test::Unit::TestCase
|
|
63
63
|
decompose(data_path))
|
64
64
|
end
|
65
65
|
|
66
|
+
sub_test_case("multibyte") do
|
67
|
+
test("cp932") do
|
68
|
+
data_path = Pathname.new(fixture_path("cp932.zip"))
|
69
|
+
base_path = data_path.sub_ext("")
|
70
|
+
path = CGI.escape("こんにちは.txt")
|
71
|
+
assert_equal([
|
72
|
+
{
|
73
|
+
:uri => file_uri("#{base_path}/cp932/#{path}").to_s,
|
74
|
+
:body => "こんにちは\n".encode("cp932").b,
|
75
|
+
:source => file_uri(data_path).to_s,
|
76
|
+
},
|
77
|
+
],
|
78
|
+
decompose(data_path))
|
79
|
+
end
|
80
|
+
|
81
|
+
test("UTF-8") do
|
82
|
+
data_path = Pathname.new(fixture_path("utf-8.zip"))
|
83
|
+
base_path = data_path.sub_ext("")
|
84
|
+
path = CGI.escape("こんにちは.txt")
|
85
|
+
assert_equal([
|
86
|
+
{
|
87
|
+
:uri => file_uri("#{base_path}/utf-8/#{path}").to_s,
|
88
|
+
:body => "こんにちは\n".b,
|
89
|
+
:source => file_uri(data_path).to_s,
|
90
|
+
},
|
91
|
+
],
|
92
|
+
decompose(data_path))
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
66
96
|
sub_test_case("encrypted") do
|
67
97
|
test("without password") do
|
68
98
|
data_path = Pathname.new(fixture_path("password.zip"))
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/test/helper.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013-
|
1
|
+
# Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -32,4 +32,34 @@ module Helper
|
|
32
32
|
def file_uri(path)
|
33
33
|
URI.parse("file://#{path}")
|
34
34
|
end
|
35
|
+
|
36
|
+
|
37
|
+
class CaptureLogger
|
38
|
+
def initialize(output)
|
39
|
+
@output = output
|
40
|
+
end
|
41
|
+
|
42
|
+
def error(message=nil)
|
43
|
+
@output << [:error, message || yield]
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def capture_log
|
48
|
+
original_logger = ChupaText.logger
|
49
|
+
begin
|
50
|
+
output = []
|
51
|
+
ChupaText.logger = CaptureLogger.new(output)
|
52
|
+
yield
|
53
|
+
normalize_log(output)
|
54
|
+
ensure
|
55
|
+
ChupaText.logger = original_logger
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
def normalize_log(log)
|
60
|
+
log.collect do |level, message|
|
61
|
+
message = message.split("\n", 2)[0]
|
62
|
+
[level, message]
|
63
|
+
end
|
64
|
+
end
|
35
65
|
end
|
data/test/test-data.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
1
|
+
# Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -44,11 +44,15 @@ class TestData < Test::Unit::TestCase
|
|
44
44
|
|
45
45
|
sub_test_case("body") do
|
46
46
|
def test_txt
|
47
|
-
body = "Hello"
|
48
|
-
body.force_encoding("ASCII-8BIT")
|
47
|
+
body = "Hello".b
|
49
48
|
assert_equal("text/plain", guess(body))
|
50
49
|
end
|
51
50
|
|
51
|
+
def test_utf8_valid_binary
|
52
|
+
body = "GROONGA:IO:00001@\0\0\0\0\0\0\0\0\0\0".b
|
53
|
+
assert_nil(guess(body))
|
54
|
+
end
|
55
|
+
|
52
56
|
private
|
53
57
|
def guess(body)
|
54
58
|
@data.body = body
|
data/test/test-extractor.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
1
|
+
# Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -121,5 +121,112 @@ class TestExtractor < Test::Unit::TestCase
|
|
121
121
|
assert_equal(["Hello", "Hello"], extract(data))
|
122
122
|
end
|
123
123
|
end
|
124
|
+
|
125
|
+
sub_test_case("body") do
|
126
|
+
def test_utf8
|
127
|
+
data = ChupaText::Data.new
|
128
|
+
data.mime_type = "text/plain"
|
129
|
+
data.body = "こんにちは"
|
130
|
+
assert_equal(["こんにちは"], extract(data))
|
131
|
+
end
|
132
|
+
|
133
|
+
def test_utf8_ascii_8bit
|
134
|
+
data = ChupaText::Data.new
|
135
|
+
data.mime_type = "text/plain"
|
136
|
+
data.body = "こんにちは".b
|
137
|
+
assert_equal(["こんにちは"], extract(data))
|
138
|
+
end
|
139
|
+
|
140
|
+
def test_utf8_broken
|
141
|
+
data = ChupaText::Data.new
|
142
|
+
data.mime_type = "text/plain"
|
143
|
+
data.body = "\x82\x00こんにちは".b
|
144
|
+
assert_equal(["こんにちは"], extract(data))
|
145
|
+
end
|
146
|
+
|
147
|
+
def test_utf16_le
|
148
|
+
data = ChupaText::Data.new
|
149
|
+
data.mime_type = "text/plain"
|
150
|
+
data.body = "こんにちは".encode("UTF-16LE")
|
151
|
+
assert_equal(["こんにちは"], extract(data))
|
152
|
+
end
|
153
|
+
|
154
|
+
def test_utf16_le_ascii_8bit
|
155
|
+
data = ChupaText::Data.new
|
156
|
+
data.mime_type = "text/plain"
|
157
|
+
data.body = "\ufeffこんにちは".encode("UTF-16LE").b
|
158
|
+
assert_equal(["こんにちは"], extract(data))
|
159
|
+
end
|
160
|
+
|
161
|
+
def test_utf16_be
|
162
|
+
data = ChupaText::Data.new
|
163
|
+
data.mime_type = "text/plain"
|
164
|
+
data.body = "こんにちは".encode("UTF-16BE")
|
165
|
+
assert_equal(["こんにちは"], extract(data))
|
166
|
+
end
|
167
|
+
|
168
|
+
def test_utf16_be_ascii_8bit
|
169
|
+
data = ChupaText::Data.new
|
170
|
+
data.mime_type = "text/plain"
|
171
|
+
data.body = "\ufeffこんにちは".encode("UTF-16BE").b
|
172
|
+
assert_equal(["こんにちは"], extract(data))
|
173
|
+
end
|
174
|
+
|
175
|
+
def test_utf32_le
|
176
|
+
data = ChupaText::Data.new
|
177
|
+
data.mime_type = "text/plain"
|
178
|
+
data.body = "こんにちは".encode("UTF-32LE")
|
179
|
+
assert_equal(["こんにちは"], extract(data))
|
180
|
+
end
|
181
|
+
|
182
|
+
def test_utf32_le_ascii_8bit
|
183
|
+
data = ChupaText::Data.new
|
184
|
+
data.mime_type = "text/plain"
|
185
|
+
data.body = "\ufeffこんにちは".encode("UTF-32LE").b
|
186
|
+
assert_equal(["こんにちは"], extract(data))
|
187
|
+
end
|
188
|
+
|
189
|
+
def test_utf32_be
|
190
|
+
data = ChupaText::Data.new
|
191
|
+
data.mime_type = "text/plain"
|
192
|
+
data.body = "こんにちは".encode("UTF-32BE")
|
193
|
+
assert_equal(["こんにちは"], extract(data))
|
194
|
+
end
|
195
|
+
|
196
|
+
def test_utf32_be_ascii_8bit
|
197
|
+
data = ChupaText::Data.new
|
198
|
+
data.mime_type = "text/plain"
|
199
|
+
data.body = "\ufeffこんにちは".encode("UTF-32BE").b
|
200
|
+
assert_equal(["こんにちは"], extract(data))
|
201
|
+
end
|
202
|
+
|
203
|
+
def test_cp932
|
204
|
+
data = ChupaText::Data.new
|
205
|
+
data.mime_type = "text/plain"
|
206
|
+
data.body = "こんにちは".encode("cp932")
|
207
|
+
assert_equal(["こんにちは"], extract(data))
|
208
|
+
end
|
209
|
+
|
210
|
+
def test_cp932_ascii_8bit
|
211
|
+
data = ChupaText::Data.new
|
212
|
+
data.mime_type = "text/plain"
|
213
|
+
data.body = "こんにちは".encode("cp932").b
|
214
|
+
assert_equal(["こんにちは"], extract(data))
|
215
|
+
end
|
216
|
+
|
217
|
+
def test_euc_jp
|
218
|
+
data = ChupaText::Data.new
|
219
|
+
data.mime_type = "text/plain"
|
220
|
+
data.body = "こんにちは".encode("euc-jp")
|
221
|
+
assert_equal(["こんにちは"], extract(data))
|
222
|
+
end
|
223
|
+
|
224
|
+
def test_euc_jp_ascii_8bit
|
225
|
+
data = ChupaText::Data.new
|
226
|
+
data.mime_type = "text/plain"
|
227
|
+
data.body = "こんにちは".encode("euc-jp").b
|
228
|
+
assert_equal(["こんにちは"], extract(data))
|
229
|
+
end
|
230
|
+
end
|
124
231
|
end
|
125
232
|
end
|