chupa-text 1.1.5 → 1.1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/chupa-text.gemspec +2 -1
  3. data/doc/text/news.md +42 -0
  4. data/lib/chupa-text/data.rb +19 -2
  5. data/lib/chupa-text/decomposers/csv.rb +20 -4
  6. data/lib/chupa-text/decomposers/office-open-xml-document.rb +12 -0
  7. data/lib/chupa-text/decomposers/office-open-xml-presentation.rb +17 -3
  8. data/lib/chupa-text/decomposers/office-open-xml-workbook.rb +83 -10
  9. data/lib/chupa-text/decomposers/office-open-xml.rb +4 -8
  10. data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb +26 -2
  11. data/lib/chupa-text/decomposers/tar.rb +18 -12
  12. data/lib/chupa-text/decomposers/zip.rb +30 -4
  13. data/lib/chupa-text/extractor.rb +5 -3
  14. data/lib/chupa-text/path-converter.rb +70 -0
  15. data/lib/chupa-text/utf8-converter.rb +117 -0
  16. data/lib/chupa-text/version.rb +1 -1
  17. data/test/command/test-chupa-text.rb +4 -4
  18. data/test/decomposers/test-csv.rb +18 -3
  19. data/test/decomposers/test-office-open-xml-presentation.rb +46 -46
  20. data/test/decomposers/test-office-open-xml-workbook.rb +64 -46
  21. data/test/decomposers/{test-open-document-presentation.rb → test-opendocument-presentation.rb} +0 -0
  22. data/test/decomposers/{test-open-document-spreadsheet.rb → test-opendocument-spreadsheet.rb} +35 -19
  23. data/test/decomposers/{test-open-document-text.rb → test-opendocument-text.rb} +0 -0
  24. data/test/decomposers/test-tar.rb +18 -1
  25. data/test/decomposers/test-zip.rb +31 -1
  26. data/test/fixture/ods/covered-table-cell.ods +0 -0
  27. data/test/fixture/ods/shapes.ods +0 -0
  28. data/test/fixture/tar/utf-8.tar +0 -0
  29. data/test/fixture/xlsx/not-shared-cell.xlsx +0 -0
  30. data/test/fixture/zip/cp932.zip +0 -0
  31. data/test/fixture/zip/utf-8.zip +0 -0
  32. data/test/helper.rb +31 -1
  33. data/test/test-data.rb +7 -3
  34. data/test/test-extractor.rb +108 -1
  35. metadata +29 -7
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -17,6 +17,8 @@
17
17
  require "stringio"
18
18
  require "rubygems/package"
19
19
 
20
+ require "chupa-text/path-converter"
21
+
20
22
  module ChupaText
21
23
  module Decomposers
22
24
  class Tar < Decomposer
@@ -28,18 +30,22 @@ module ChupaText
28
30
  end
29
31
 
30
32
  def decompose(data)
31
- Gem::Package::TarReader.new(StringIO.new(data.body)) do |reader|
32
- reader.each do |entry|
33
- next unless entry.file?
33
+ data.open do |input|
34
+ Gem::Package::TarReader.new(input) do |reader|
35
+ reader.each do |entry|
36
+ next unless entry.file?
34
37
 
35
- entry.extend(CopyStreamable)
36
- entry_uri = data.uri.dup
37
- base_path = entry_uri.path.gsub(/\.tar\z/i, "")
38
- entry_uri.path = "#{base_path}/#{entry.full_name}"
39
- extracted = VirtualFileData.new(entry_uri,
40
- entry,
41
- :source_data => data)
42
- yield(extracted)
38
+ entry.extend(CopyStreamable)
39
+ entry_uri = data.uri.dup
40
+ base_path = entry_uri.path.gsub(/\.tar\z/i, "")
41
+ path_converter = PathConverter.new(entry.full_name,
42
+ uri_escape: true)
43
+ entry_uri.path = "#{base_path}/#{path_converter.convert}"
44
+ extracted = VirtualFileData.new(entry_uri,
45
+ entry,
46
+ :source_data => data)
47
+ yield(extracted)
48
+ end
43
49
  end
44
50
  end
45
51
  end
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2017 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2017-2019 Kouhei Sutou <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -15,13 +15,16 @@
15
15
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
16
 
17
17
  require "stringio"
18
- require "tmpdir"
19
18
 
20
19
  require "archive/zip"
21
20
 
21
+ require "chupa-text/path-converter"
22
+
22
23
  module ChupaText
23
24
  module Decomposers
24
25
  class Zip < Decomposer
26
+ include Loggable
27
+
25
28
  registry.register("zip", self)
26
29
 
27
30
  def target?(data)
@@ -32,7 +35,7 @@ module ChupaText
32
35
  end
33
36
 
34
37
  def decompose(data)
35
- Archive::Zip.open(StringIO.new(data.body)) do |zip|
38
+ open_zip(data) do |zip|
36
39
  zip.each do |entry|
37
40
  next unless entry.file?
38
41
 
@@ -45,7 +48,10 @@ module ChupaText
45
48
  end
46
49
  entry_uri = data.uri.dup
47
50
  base_path = entry_uri.path.gsub(/\.zip\z/i, "")
48
- entry_uri.path = "#{base_path}/#{entry.zip_path}"
51
+ path_converter = PathConverter.new(entry.zip_path,
52
+ encoding: base_path.encoding,
53
+ uri_escape: true)
54
+ entry_uri.path = "#{base_path}/#{path_converter.convert}"
49
55
  entry_data = VirtualFileData.new(entry_uri,
50
56
  entry.file_data,
51
57
  source_data: data)
@@ -53,6 +59,26 @@ module ChupaText
53
59
  end
54
60
  end
55
61
  end
62
+
63
+ private
64
+ def open_zip(data)
65
+ begin
66
+ Archive::Zip.open(StringIO.new(data.body)) do |zip|
67
+ yield(zip)
68
+ end
69
+ rescue Archive::Zip::Error => zip_error
70
+ error do
71
+ message = "#{log_tag} Failed to process zip: "
72
+ message << "#{zip_error.class}: #{zip_error.message}\n"
73
+ message << zip_error.backtrace.join("\n")
74
+ message
75
+ end
76
+ end
77
+ end
78
+
79
+ def log_tag
80
+ "[decomposer][zip]"
81
+ end
56
82
  end
57
83
  end
58
84
  end
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -68,11 +68,13 @@ module ChupaText
68
68
  if decomposer.nil?
69
69
  if target.text_plain?
70
70
  debug {"#{log_tag}[extract][text-plain]"}
71
- yield(target)
71
+ yield(target.to_utf8_body_data)
72
72
  next
73
73
  else
74
74
  debug {"#{log_tag}[extract][decomposer] not found"}
75
- yield(target) if target.text?
75
+ if target.text?
76
+ yield(target.to_utf8_body_data)
77
+ end
76
78
  next
77
79
  end
78
80
  end
@@ -0,0 +1,70 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "cgi/util"
18
+
19
module ChupaText
  # Converts a path taken from an archive entry into a string that can be
  # appended to a URI path.
  #
  # Recognized options:
  #
  #   * `:encoding`: when given, the path is transcoded to this encoding.
  #     A path tagged as binary (ASCII-8BIT) has its real encoding
  #     guessed first.
  #   * `:uri_escape`: when true, every "/"-separated component is
  #     escaped with `CGI.escape`.
  class PathConverter
    # Encodings tried, in this order, for a binary-tagged path that
    # contains non-ASCII bytes.
    GUESS_ENCODINGS = [
      Encoding::UTF_8,
      Encoding::EUC_JP,
      Encoding::Windows_31J,
    ].freeze
    private_constant :GUESS_ENCODINGS

    # @param path [String] the path to convert. It is never modified.
    # @param options [Hash] see the class description.
    def initialize(path, options={})
      @path = path
      @options = options
    end

    # @return [String] the converted path. When no option is set, the
    #   path given to the constructor is returned as is.
    def convert
      path = @path
      encoding = @options[:encoding]
      path = convert_encoding(path, encoding) if encoding
      path = convert_to_uri_path(path) if @options[:uri_escape]
      path
    end

    private
    # Transcodes `path` to `encoding`, dropping bytes that are invalid in
    # the source encoding or undefined in the target encoding.
    def convert_encoding(path, encoding)
      path = tag_binary_path(path) if path.encoding == Encoding::ASCII_8BIT
      path.encode(encoding,
                  invalid: :replace,
                  undef: :replace,
                  replace: "")
    end

    # Returns a copy of the binary-tagged `path` that is tagged with the
    # first encoding in which its bytes are valid.
    #
    # The copy matters: `force_encoding` changes its receiver in place,
    # so calling it on `path` itself would re-tag the string owned by the
    # caller and would raise `FrozenError` for a frozen string.
    def tag_binary_path(path)
      tagged = path.dup
      if tagged.ascii_only?
        tagged.force_encoding(Encoding::UTF_8)
      else
        found = GUESS_ENCODINGS.any? do |candidate|
          tagged.force_encoding(candidate).valid_encoding?
        end
        # No candidate fits: keep the bytes tagged as binary so that the
        # following encode drops what it can't map.
        tagged.force_encoding(Encoding::ASCII_8BIT) unless found
      end
      tagged
    end

    # Escapes each component separately so that "/" keeps working as the
    # path separator.
    #
    # NOTE(review): CGI.escape uses form encoding, so a space becomes "+"
    # instead of "%20". Confirm that this is what URI consumers expect.
    def convert_to_uri_path(path)
      path.split("/").collect {|component| CGI.escape(component)}.join("/")
    end
  end
end
@@ -0,0 +1,117 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
module ChupaText
  # Converts a string in an arbitrary (possibly unknown) encoding to UTF-8.
  #
  # The string passed to the constructor is never modified.
  class UTF8Converter
    # Byte order marks, kept as binary strings so that they can be
    # compared with the leading bytes of any input.
    UTF_8_BOM = "\xef\xbb\xbf".b
    UTF_16BE_BOM = "\xfe\xff".b
    UTF_16LE_BOM = "\xff\xfe".b
    UTF_32BE_BOM = "\x00\x00\xfe\xff".b
    UTF_32LE_BOM = "\xff\xfe\x00\x00".b

    # Encodings tried, in this order, for binary input without a BOM.
    GUESS_ENCODINGS = [
      Encoding::UTF_8,
      Encoding::EUC_JP,
      Encoding::Windows_31J,
    ].freeze
    private_constant :GUESS_ENCODINGS

    # @param string [String] the text to convert.
    def initialize(string)
      @string = string
    end

    # Returns the text as UTF-8.
    #
    #   * UTF-8 input: returned as is, minus a leading BOM.
    #   * ASCII only binary input: returned as is (still tagged as binary).
    #   * Input tagged with any other encoding: transcoded to UTF-8.
    #   * Other binary input: decoded by BOM, then by guessing. As a last
    #     resort the bytes are treated as UTF-8 and invalid sequences and
    #     control characters are removed.
    #
    # Characters that can't be converted are dropped.
    #
    # @return [String]
    def convert
      case @string.encoding
      when Encoding::UTF_8
        # NOTE(review): any detected BOM is stripped here, not only the
        # UTF-8 one. Confirm that this is intended.
        bom_size, = detect_bom
        return @string unless bom_size
        return without_bom(bom_size)
      when Encoding::ASCII_8BIT
        return @string if @string.ascii_only?
      else
        return encode_to_utf8(@string)
      end

      # Only non-ASCII binary input reaches here.
      bom_size, bom_encoding = detect_bom
      if bom_encoding
        return encode_to_utf8(without_bom(bom_size), bom_encoding)
      end

      guessed_encoding = guess_encoding
      return encode_to_utf8(@string, guessed_encoding) if guessed_encoding

      utf8_body = @string.dup
      utf8_body.force_encoding(Encoding::UTF_8)
      utf8_body.scrub!("")
      utf8_body.gsub!(/\p{Control}+/, "")
      utf8_body
    end

    private
    # Transcodes `string` to UTF-8. The bytes are interpreted as
    # `source_encoding` when given, otherwise as the encoding `string` is
    # tagged with.
    def encode_to_utf8(string, source_encoding=nil)
      options = {invalid: :replace, undef: :replace, replace: ""}
      if source_encoding
        string.encode(Encoding::UTF_8, source_encoding, **options)
      else
        string.encode(Encoding::UTF_8, **options)
      end
    end

    # Returns the text that follows the first `bom_size` bytes.
    def without_bom(bom_size)
      @string.byteslice(bom_size, @string.bytesize - bom_size)
    end

    # Returns `[bom_size, bom_encoding]` for the BOM the text starts
    # with, or nil when there is none. The 4 byte BOMs are checked first
    # because the UTF-32LE BOM starts with the UTF-16LE BOM.
    def detect_bom
      case @string.byteslice(0, 4).b
      when UTF_32BE_BOM
        return 4, Encoding::UTF_32BE
      when UTF_32LE_BOM
        return 4, Encoding::UTF_32LE
      end

      case @string.byteslice(0, 3).b
      when UTF_8_BOM
        return 3, Encoding::UTF_8
      end

      case @string.byteslice(0, 2).b
      when UTF_16BE_BOM
        return 2, Encoding::UTF_16BE
      when UTF_16LE_BOM
        return 2, Encoding::UTF_16LE
      end

      nil
    end

    # Returns the first candidate encoding in which the bytes are valid,
    # or nil when none fits.
    #
    # The probing is done on a duplicate: `force_encoding` changes its
    # receiver in place, so probing `@string` directly would re-tag the
    # caller's string while guessing and would raise `FrozenError` for a
    # frozen string.
    def guess_encoding
      probe = @string.dup
      GUESS_ENCODINGS.find do |candidate|
        probe.force_encoding(candidate).valid_encoding?
      end
    end
  end
end
@@ -15,5 +15,5 @@
15
15
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
16
 
17
17
  module ChupaText
18
- VERSION = "1.1.5"
18
+ VERSION = "1.1.6"
19
19
  end
@@ -289,7 +289,7 @@ class TestCommandChupaText < Test::Unit::TestCase
289
289
  "path" => path.sub_ext(".txt").to_s,
290
290
  "mime-type" => "text/plain",
291
291
  "source-mime-types" => ["text/csv"],
292
- "body" => "1 2 3\n4 5 6\n7 8 9\n",
292
+ "body" => "1\t2\t3\n4\t5\t6\n7\t8\t9\n",
293
293
  "size" => 18,
294
294
  "screenshot" => {
295
295
  "mime-type" => "image/svg+xml",
@@ -304,9 +304,9 @@ class TestCommandChupaText < Test::Unit::TestCase
304
304
  x="0"
305
305
  y="20"
306
306
  style="font-size: 20px; white-space: pre-wrap;"
307
- xml:space="preserve">1 2 3
308
- 4 5 6
309
- 7 8 9
307
+ xml:space="preserve">1\t2\t3
308
+ 4\t5\t6
309
+ 7\t8\t9
310
310
  </text>
311
311
  </svg>
312
312
  SVG
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -22,15 +22,30 @@ class TestDecomposersCSV< Test::Unit::TestCase
22
22
  end
23
23
 
24
24
  sub_test_case("decompose") do
25
- def test_body
25
+ def test_valid
26
26
  csv = <<-CSV
27
27
  Hello,World
28
28
  Ruby,ChupaText
29
29
  CSV
30
- assert_equal([csv.gsub(/,/, " ")],
30
+ assert_equal([csv.gsub(/,/, "\t")],
31
31
  decompose(csv).collect(&:body))
32
32
  end
33
33
 
34
+ def test_invalid
35
+ messages = capture_log do
36
+ assert_equal([], decompose("He\x82\x00llo").collect(&:body))
37
+ end
38
+ assert_equal([
39
+ [
40
+ :error,
41
+ "[decomposer][csv] Failed to parse CSV: " +
42
+ "CSV::MalformedCSVError: " +
43
+ "Invalid byte sequence in UTF-8 in line 1.",
44
+ ],
45
+ ],
46
+ messages)
47
+ end
48
+
34
49
  private
35
50
  def decompose(csv)
36
51
  data = ChupaText::Data.new
@@ -48,41 +48,32 @@ class TestDecomposersOfficeOpenXMLPresentation < Test::Unit::TestCase
48
48
  sub_test_case("#decompose") do
49
49
  sub_test_case("attributes") do
50
50
  def decompose(attribute_name)
51
- super(fixture_path("pptx", "attributes.pptx")).collect do |data|
52
- data[attribute_name]
53
- end
51
+ super(fixture_path("pptx", "attributes.pptx")).first[attribute_name]
54
52
  end
55
53
 
56
54
  def test_title
57
- assert_equal(["Title"], decompose("title"))
55
+ assert_equal("Title", decompose("title"))
58
56
  end
59
57
 
60
58
  def test_author
61
- assert_equal([nil], decompose("author"))
59
+ assert_equal(nil, decompose("author"))
62
60
  end
63
61
 
64
62
  def test_subject
65
- assert_equal(["Subject"], decompose("subject"))
63
+ assert_equal("Subject", decompose("subject"))
66
64
  end
67
65
 
68
66
  def test_keywords
69
- assert_equal(["Keyword1 Keyword2"], decompose("keywords"))
67
+ assert_equal("Keyword1 Keyword2", decompose("keywords"))
70
68
  end
71
69
 
72
70
  def test_modified_time
73
- assert_equal([Time],
74
- decompose("modified_time").collect(&:class))
71
+ assert_equal(Time, decompose("modified_time").class)
75
72
  end
76
73
 
77
74
  def test_application
78
- assert_equal(["LibreOffice"],
79
- normalize_applications(decompose("application")))
80
- end
81
-
82
- def normalize_applications(applications)
83
- applications.collect do |application|
84
- normalize_application(application)
85
- end
75
+ assert_equal("LibreOffice",
76
+ normalize_application(decompose("application")))
86
77
  end
87
78
 
88
79
  def normalize_application(application)
@@ -92,41 +83,50 @@ class TestDecomposersOfficeOpenXMLPresentation < Test::Unit::TestCase
92
83
  application
93
84
  end
94
85
  end
95
-
96
- def test_creation_date
97
- assert_equal([nil], decompose("creation_date"))
98
- end
99
86
  end
100
87
 
101
- sub_test_case("one slide") do
102
- def decompose
103
- super(fixture_path("pptx", "one-slide.pptx"))
104
- end
105
-
106
- def test_body
107
- assert_equal([<<-BODY], decompose.collect(&:body))
108
- Slide1 title
109
- Slide1 content
110
- BODY
88
+ sub_test_case("slides") do
89
+ def decompose(path)
90
+ super(path).collect do |data|
91
+ [
92
+ data["index"],
93
+ data.body,
94
+ ]
95
+ end
111
96
  end
112
- end
113
97
 
114
- sub_test_case("multi slides") do
115
- def decompose
116
- super(fixture_path("pptx", "multi-slides.pptx"))
98
+ def test_one_slide
99
+ assert_equal([
100
+ [nil, ""],
101
+ [
102
+ 0,
103
+ "Slide1 title\n" +
104
+ "Slide1 content\n",
105
+ ],
106
+ ],
107
+ decompose(fixture_path("pptx", "one-slide.pptx")))
117
108
  end
118
109
 
119
- def test_body
120
- assert_equal([<<-BODY], decompose.collect(&:body))
121
- Slide1 title
122
- Slide1 content
123
-
124
- Slide2 title
125
- Slide2 content
126
-
127
- Slide3 title
128
- Slide3 content
129
- BODY
110
+ def test_multi_slides
111
+ assert_equal([
112
+ [nil, ""],
113
+ [
114
+ 0,
115
+ "Slide1 title\n" +
116
+ "Slide1 content\n",
117
+ ],
118
+ [
119
+ 1,
120
+ "Slide2 title\n" +
121
+ "Slide2 content\n",
122
+ ],
123
+ [
124
+ 2,
125
+ "Slide3 title\n" +
126
+ "Slide3 content\n",
127
+ ],
128
+ ],
129
+ decompose(fixture_path("pptx", "multi-slides.pptx")))
130
130
  end
131
131
  end
132
132
  end