chupa-text 1.1.5 → 1.1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/chupa-text.gemspec +2 -1
  3. data/doc/text/news.md +42 -0
  4. data/lib/chupa-text/data.rb +19 -2
  5. data/lib/chupa-text/decomposers/csv.rb +20 -4
  6. data/lib/chupa-text/decomposers/office-open-xml-document.rb +12 -0
  7. data/lib/chupa-text/decomposers/office-open-xml-presentation.rb +17 -3
  8. data/lib/chupa-text/decomposers/office-open-xml-workbook.rb +83 -10
  9. data/lib/chupa-text/decomposers/office-open-xml.rb +4 -8
  10. data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb +26 -2
  11. data/lib/chupa-text/decomposers/tar.rb +18 -12
  12. data/lib/chupa-text/decomposers/zip.rb +30 -4
  13. data/lib/chupa-text/extractor.rb +5 -3
  14. data/lib/chupa-text/path-converter.rb +70 -0
  15. data/lib/chupa-text/utf8-converter.rb +117 -0
  16. data/lib/chupa-text/version.rb +1 -1
  17. data/test/command/test-chupa-text.rb +4 -4
  18. data/test/decomposers/test-csv.rb +18 -3
  19. data/test/decomposers/test-office-open-xml-presentation.rb +46 -46
  20. data/test/decomposers/test-office-open-xml-workbook.rb +64 -46
  21. data/test/decomposers/{test-open-document-presentation.rb → test-opendocument-presentation.rb} +0 -0
  22. data/test/decomposers/{test-open-document-spreadsheet.rb → test-opendocument-spreadsheet.rb} +35 -19
  23. data/test/decomposers/{test-open-document-text.rb → test-opendocument-text.rb} +0 -0
  24. data/test/decomposers/test-tar.rb +18 -1
  25. data/test/decomposers/test-zip.rb +31 -1
  26. data/test/fixture/ods/covered-table-cell.ods +0 -0
  27. data/test/fixture/ods/shapes.ods +0 -0
  28. data/test/fixture/tar/utf-8.tar +0 -0
  29. data/test/fixture/xlsx/not-shared-cell.xlsx +0 -0
  30. data/test/fixture/zip/cp932.zip +0 -0
  31. data/test/fixture/zip/utf-8.zip +0 -0
  32. data/test/helper.rb +31 -1
  33. data/test/test-data.rb +7 -3
  34. data/test/test-extractor.rb +108 -1
  35. metadata +29 -7
data/test/decomposers/test-office-open-xml-workbook.rb CHANGED
@@ -48,46 +48,36 @@ class TestDecomposersOfficeOpenXMLWorkbook < Test::Unit::TestCase
48
48
  sub_test_case("#decompose") do
49
49
  sub_test_case("attributes") do
50
50
  def decompose(attribute_name)
51
- super(fixture_path("xlsx", "attributes.xlsx")).collect do |data|
52
- data[attribute_name]
53
- end
51
+ super(fixture_path("xlsx", "attributes.xlsx")).first[attribute_name]
54
52
  end
55
53
 
56
54
  def test_title
57
- assert_equal(["Title"], decompose("title"))
55
+ assert_equal("Title", decompose("title"))
58
56
  end
59
57
 
60
58
  def test_author
61
- assert_equal([nil], decompose("author"))
59
+ assert_equal(nil, decompose("author"))
62
60
  end
63
61
 
64
62
  def test_subject
65
- assert_equal(["Subject"], decompose("subject"))
63
+ assert_equal("Subject", decompose("subject"))
66
64
  end
67
65
 
68
66
  def test_keywords
69
- assert_equal(["Keyword1 Keyword2"], decompose("keywords"))
67
+ assert_equal("Keyword1 Keyword2", decompose("keywords"))
70
68
  end
71
69
 
72
70
  def test_created_time
73
- assert_equal([Time],
74
- decompose("created_time").collect(&:class))
71
+ assert_equal(Time, decompose("created_time").class)
75
72
  end
76
73
 
77
74
  def test_modified_time
78
- assert_equal([Time],
79
- decompose("modified_time").collect(&:class))
75
+ assert_equal(Time, decompose("modified_time").class)
80
76
  end
81
77
 
82
78
  def test_application
83
- assert_equal(["LibreOffice"],
84
- normalize_applications(decompose("application")))
85
- end
86
-
87
- def normalize_applications(applications)
88
- applications.collect do |application|
89
- normalize_application(application)
90
- end
79
+ assert_equal("LibreOffice",
80
+ normalize_application(decompose("application")))
91
81
  end
92
82
 
93
83
  def normalize_application(application)
@@ -97,41 +87,69 @@ class TestDecomposersOfficeOpenXMLWorkbook < Test::Unit::TestCase
97
87
  application
98
88
  end
99
89
  end
100
-
101
- def test_creation_date
102
- assert_equal([nil], decompose("creation_date"))
103
- end
104
90
  end
105
91
 
106
- sub_test_case("one sheet") do
107
- def decompose
108
- super(fixture_path("xlsx", "one-sheet.xlsx"))
92
+ sub_test_case("sheets") do
93
+ def decompose(path)
94
+ super(path).collect do |data|
95
+ [
96
+ data["index"],
97
+ data["name"],
98
+ data.body,
99
+ ]
100
+ end
109
101
  end
110
102
 
111
- def test_body
112
- assert_equal([<<-BODY], decompose.collect(&:body))
113
- Sheet1 - A1\tSheet1 - B1
114
- Sheet1 - A2\tSheet1 - B2
115
- BODY
103
+ def test_one_sheet
104
+ assert_equal([
105
+ [nil, nil, ""],
106
+ [
107
+ 0,
108
+ "Sheet1",
109
+ "Sheet1 - A1\tSheet1 - B1\n" +
110
+ "Sheet1 - A2\tSheet1 - B2\n",
111
+ ],
112
+ ],
113
+ decompose(fixture_path("xlsx", "one-sheet.xlsx")))
116
114
  end
117
- end
118
115
 
119
- sub_test_case("multi sheets") do
120
- def decompose
121
- super(fixture_path("xlsx", "multi-sheets.xlsx"))
116
+ def test_no_shared_cell
117
+ assert_equal([
118
+ [nil, nil, ""],
119
+ [
120
+ 0,
121
+ "Sheet1",
122
+ "Sheet1 - A1\tSheet1 - B1\n" +
123
+ "Sheet1 - A2\tSheet1 - B2\n" +
124
+ "0.5\t0.5\n",
125
+ ],
126
+ ],
127
+ decompose(fixture_path("xlsx", "not-shared-cell.xlsx")))
122
128
  end
123
129
 
124
- def test_body
125
- assert_equal([<<-BODY], decompose.collect(&:body))
126
- Sheet1 - A1\tSheet1 - B1
127
- Sheet1 - A2\tSheet1 - B2
128
-
129
- Sheet2 - A1\tSheet2 - B1
130
- Sheet2 - A2\tSheet2 - B2
131
-
132
- Sheet3 - A1\tSheet3 - B1
133
- Sheet3 - A2\tSheet3 - B2
134
- BODY
130
+ def test_multi_sheets
131
+ assert_equal([
132
+ [nil, nil, ""],
133
+ [
134
+ 0,
135
+ "Sheet1",
136
+ "Sheet1 - A1\tSheet1 - B1\n" +
137
+ "Sheet1 - A2\tSheet1 - B2\n",
138
+ ],
139
+ [
140
+ 1,
141
+ "Sheet2",
142
+ "Sheet2 - A1\tSheet2 - B1\n" +
143
+ "Sheet2 - A2\tSheet2 - B2\n",
144
+ ],
145
+ [
146
+ 2,
147
+ "Sheet3",
148
+ "Sheet3 - A1\tSheet3 - B1\n" +
149
+ "Sheet3 - A2\tSheet3 - B2\n",
150
+ ],
151
+ ],
152
+ decompose(fixture_path("xlsx", "multi-sheets.xlsx")))
135
153
  end
136
154
  end
137
155
  end
@@ -87,9 +87,9 @@ class TestDecomposersOpenDocumentSpreadsheet < Test::Unit::TestCase
87
87
  end
88
88
  end
89
89
 
90
- sub_test_case("one sheet") do
91
- def decompose
92
- super(fixture_path("ods", "one-sheet.ods")).collect do |data|
90
+ sub_test_case("sheets") do
91
+ def decompose(path)
92
+ super(path).collect do |data|
93
93
  [
94
94
  data["index"],
95
95
  data["name"],
@@ -98,7 +98,7 @@ class TestDecomposersOpenDocumentSpreadsheet < Test::Unit::TestCase
98
98
  end
99
99
  end
100
100
 
101
- def test_body
101
+ def test_one_sheet
102
102
  assert_equal([
103
103
  [nil, nil, ""],
104
104
  [
@@ -108,22 +108,10 @@ class TestDecomposersOpenDocumentSpreadsheet < Test::Unit::TestCase
108
108
  "Sheet1 - A2\tSheet1 - B2\n",
109
109
  ],
110
110
  ],
111
- decompose)
112
- end
113
- end
114
-
115
- sub_test_case("multi sheets") do
116
- def decompose
117
- super(fixture_path("ods", "multi-sheets.ods")).collect do |data|
118
- [
119
- data["index"],
120
- data["name"],
121
- data.body,
122
- ]
123
- end
111
+ decompose(fixture_path("ods", "one-sheet.ods")))
124
112
  end
125
113
 
126
- def test_body
114
+ def test_multi_sheets
127
115
  assert_equal([
128
116
  [nil, nil, ""],
129
117
  [
@@ -145,7 +133,35 @@ class TestDecomposersOpenDocumentSpreadsheet < Test::Unit::TestCase
145
133
  "Sheet3 - A2\tSheet3 - B2\n",
146
134
  ],
147
135
  ],
148
- decompose)
136
+ decompose(fixture_path("ods", "multi-sheets.ods")))
137
+ end
138
+
139
+ def test_covered_table_cell
140
+ assert_equal([
141
+ [nil, nil, ""],
142
+ [
143
+ 0,
144
+ "Sheet1",
145
+ "Covered-table-cell\t\n",
146
+ ],
147
+ ],
148
+ decompose(fixture_path("ods", "covered-table-cell.ods")))
149
+ end
150
+
151
+ def test_shapes
152
+ assert_equal([
153
+ [nil, nil, ""],
154
+ [
155
+ 0,
156
+ "Sheet1",
157
+ "Shape1\n" +
158
+ "Shape1\n" +
159
+ "Shape2\n" +
160
+ "Shape2\n" +
161
+ "Cell\n",
162
+ ],
163
+ ],
164
+ decompose(fixture_path("ods", "shapes.ods")))
149
165
  end
150
166
  end
151
167
  end
data/test/decomposers/test-tar.rb CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -70,5 +70,22 @@ class TestDecomposersTar < Test::Unit::TestCase
70
70
  decompose(data))
71
71
  end
72
72
  end
73
+
74
+ sub_test_case("multibyte") do
75
+ test("UTF-8") do
76
+ data_path = Pathname.new(fixture_path("utf-8.tar"))
77
+ base_path = data_path.sub_ext("")
78
+ data = ChupaText::InputData.new(data_path)
79
+ path = CGI.escape("こんにちは.txt")
80
+ assert_equal([
81
+ {
82
+ :uri => file_uri("#{base_path}/utf-8/#{path}").to_s,
83
+ :body => "こんにちは\n".b,
84
+ :source => data.uri.to_s,
85
+ },
86
+ ],
87
+ decompose(data))
88
+ end
89
+ end
73
90
  end
74
91
  end
data/test/decomposers/test-zip.rb CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2017 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2017-2019 Kouhei Sutou <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -63,6 +63,36 @@ class TestDecomposersZip < Test::Unit::TestCase
63
63
  decompose(data_path))
64
64
  end
65
65
 
66
+ sub_test_case("multibyte") do
67
+ test("cp932") do
68
+ data_path = Pathname.new(fixture_path("cp932.zip"))
69
+ base_path = data_path.sub_ext("")
70
+ path = CGI.escape("こんにちは.txt")
71
+ assert_equal([
72
+ {
73
+ :uri => file_uri("#{base_path}/cp932/#{path}").to_s,
74
+ :body => "こんにちは\n".encode("cp932").b,
75
+ :source => file_uri(data_path).to_s,
76
+ },
77
+ ],
78
+ decompose(data_path))
79
+ end
80
+
81
+ test("UTF-8") do
82
+ data_path = Pathname.new(fixture_path("utf-8.zip"))
83
+ base_path = data_path.sub_ext("")
84
+ path = CGI.escape("こんにちは.txt")
85
+ assert_equal([
86
+ {
87
+ :uri => file_uri("#{base_path}/utf-8/#{path}").to_s,
88
+ :body => "こんにちは\n".b,
89
+ :source => file_uri(data_path).to_s,
90
+ },
91
+ ],
92
+ decompose(data_path))
93
+ end
94
+ end
95
+
66
96
  sub_test_case("encrypted") do
67
97
  test("without password") do
68
98
  data_path = Pathname.new(fixture_path("password.zip"))
Binary file
Binary file
Binary file
Binary file
data/test/helper.rb CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -32,4 +32,34 @@ module Helper
32
32
  def file_uri(path)
33
33
  URI.parse("file://#{path}")
34
34
  end
35
+
36
+
37
+ class CaptureLogger
38
+ def initialize(output)
39
+ @output = output
40
+ end
41
+
42
+ def error(message=nil)
43
+ @output << [:error, message || yield]
44
+ end
45
+ end
46
+
47
+ def capture_log
48
+ original_logger = ChupaText.logger
49
+ begin
50
+ output = []
51
+ ChupaText.logger = CaptureLogger.new(output)
52
+ yield
53
+ normalize_log(output)
54
+ ensure
55
+ ChupaText.logger = original_logger
56
+ end
57
+ end
58
+
59
+ def normalize_log(log)
60
+ log.collect do |level, message|
61
+ message = message.split("\n", 2)[0]
62
+ [level, message]
63
+ end
64
+ end
35
65
  end
data/test/test-data.rb CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -44,11 +44,15 @@ class TestData < Test::Unit::TestCase
44
44
 
45
45
  sub_test_case("body") do
46
46
  def test_txt
47
- body = "Hello"
48
- body.force_encoding("ASCII-8BIT")
47
+ body = "Hello".b
49
48
  assert_equal("text/plain", guess(body))
50
49
  end
51
50
 
51
+ def test_utf8_valid_binary
52
+ body = "GROONGA:IO:00001@\0\0\0\0\0\0\0\0\0\0".b
53
+ assert_nil(guess(body))
54
+ end
55
+
52
56
  private
53
57
  def guess(body)
54
58
  @data.body = body
data/test/test-extractor.rb CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -121,5 +121,112 @@ class TestExtractor < Test::Unit::TestCase
121
121
  assert_equal(["Hello", "Hello"], extract(data))
122
122
  end
123
123
  end
124
+
125
+ sub_test_case("body") do
126
+ def test_utf8
127
+ data = ChupaText::Data.new
128
+ data.mime_type = "text/plain"
129
+ data.body = "こんにちは"
130
+ assert_equal(["こんにちは"], extract(data))
131
+ end
132
+
133
+ def test_utf8_ascii_8bit
134
+ data = ChupaText::Data.new
135
+ data.mime_type = "text/plain"
136
+ data.body = "こんにちは".b
137
+ assert_equal(["こんにちは"], extract(data))
138
+ end
139
+
140
+ def test_utf8_broken
141
+ data = ChupaText::Data.new
142
+ data.mime_type = "text/plain"
143
+ data.body = "\x82\x00こんにちは".b
144
+ assert_equal(["こんにちは"], extract(data))
145
+ end
146
+
147
+ def test_utf16_le
148
+ data = ChupaText::Data.new
149
+ data.mime_type = "text/plain"
150
+ data.body = "こんにちは".encode("UTF-16LE")
151
+ assert_equal(["こんにちは"], extract(data))
152
+ end
153
+
154
+ def test_utf16_le_ascii_8bit
155
+ data = ChupaText::Data.new
156
+ data.mime_type = "text/plain"
157
+ data.body = "\ufeffこんにちは".encode("UTF-16LE").b
158
+ assert_equal(["こんにちは"], extract(data))
159
+ end
160
+
161
+ def test_utf16_be
162
+ data = ChupaText::Data.new
163
+ data.mime_type = "text/plain"
164
+ data.body = "こんにちは".encode("UTF-16BE")
165
+ assert_equal(["こんにちは"], extract(data))
166
+ end
167
+
168
+ def test_utf16_be_ascii_8bit
169
+ data = ChupaText::Data.new
170
+ data.mime_type = "text/plain"
171
+ data.body = "\ufeffこんにちは".encode("UTF-16BE").b
172
+ assert_equal(["こんにちは"], extract(data))
173
+ end
174
+
175
+ def test_utf32_le
176
+ data = ChupaText::Data.new
177
+ data.mime_type = "text/plain"
178
+ data.body = "こんにちは".encode("UTF-32LE")
179
+ assert_equal(["こんにちは"], extract(data))
180
+ end
181
+
182
+ def test_utf32_le_ascii_8bit
183
+ data = ChupaText::Data.new
184
+ data.mime_type = "text/plain"
185
+ data.body = "\ufeffこんにちは".encode("UTF-32LE").b
186
+ assert_equal(["こんにちは"], extract(data))
187
+ end
188
+
189
+ def test_utf32_be
190
+ data = ChupaText::Data.new
191
+ data.mime_type = "text/plain"
192
+ data.body = "こんにちは".encode("UTF-32BE")
193
+ assert_equal(["こんにちは"], extract(data))
194
+ end
195
+
196
+ def test_utf32_be_ascii_8bit
197
+ data = ChupaText::Data.new
198
+ data.mime_type = "text/plain"
199
+ data.body = "\ufeffこんにちは".encode("UTF-32BE").b
200
+ assert_equal(["こんにちは"], extract(data))
201
+ end
202
+
203
+ def test_cp932
204
+ data = ChupaText::Data.new
205
+ data.mime_type = "text/plain"
206
+ data.body = "こんにちは".encode("cp932")
207
+ assert_equal(["こんにちは"], extract(data))
208
+ end
209
+
210
+ def test_cp932_ascii_8bit
211
+ data = ChupaText::Data.new
212
+ data.mime_type = "text/plain"
213
+ data.body = "こんにちは".encode("cp932").b
214
+ assert_equal(["こんにちは"], extract(data))
215
+ end
216
+
217
+ def test_euc_jp
218
+ data = ChupaText::Data.new
219
+ data.mime_type = "text/plain"
220
+ data.body = "こんにちは".encode("euc-jp")
221
+ assert_equal(["こんにちは"], extract(data))
222
+ end
223
+
224
+ def test_euc_jp_ascii_8bit
225
+ data = ChupaText::Data.new
226
+ data.mime_type = "text/plain"
227
+ data.body = "こんにちは".encode("euc-jp").b
228
+ assert_equal(["こんにちは"], extract(data))
229
+ end
230
+ end
124
231
  end
125
232
  end