chupa-text 1.1.5 → 1.1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/chupa-text.gemspec +2 -1
- data/doc/text/news.md +42 -0
- data/lib/chupa-text/data.rb +19 -2
- data/lib/chupa-text/decomposers/csv.rb +20 -4
- data/lib/chupa-text/decomposers/office-open-xml-document.rb +12 -0
- data/lib/chupa-text/decomposers/office-open-xml-presentation.rb +17 -3
- data/lib/chupa-text/decomposers/office-open-xml-workbook.rb +83 -10
- data/lib/chupa-text/decomposers/office-open-xml.rb +4 -8
- data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb +26 -2
- data/lib/chupa-text/decomposers/tar.rb +18 -12
- data/lib/chupa-text/decomposers/zip.rb +30 -4
- data/lib/chupa-text/extractor.rb +5 -3
- data/lib/chupa-text/path-converter.rb +70 -0
- data/lib/chupa-text/utf8-converter.rb +117 -0
- data/lib/chupa-text/version.rb +1 -1
- data/test/command/test-chupa-text.rb +4 -4
- data/test/decomposers/test-csv.rb +18 -3
- data/test/decomposers/test-office-open-xml-presentation.rb +46 -46
- data/test/decomposers/test-office-open-xml-workbook.rb +64 -46
- data/test/decomposers/{test-open-document-presentation.rb → test-opendocument-presentation.rb} +0 -0
- data/test/decomposers/{test-open-document-spreadsheet.rb → test-opendocument-spreadsheet.rb} +35 -19
- data/test/decomposers/{test-open-document-text.rb → test-opendocument-text.rb} +0 -0
- data/test/decomposers/test-tar.rb +18 -1
- data/test/decomposers/test-zip.rb +31 -1
- data/test/fixture/ods/covered-table-cell.ods +0 -0
- data/test/fixture/ods/shapes.ods +0 -0
- data/test/fixture/tar/utf-8.tar +0 -0
- data/test/fixture/xlsx/not-shared-cell.xlsx +0 -0
- data/test/fixture/zip/cp932.zip +0 -0
- data/test/fixture/zip/utf-8.zip +0 -0
- data/test/helper.rb +31 -1
- data/test/test-data.rb +7 -3
- data/test/test-extractor.rb +108 -1
- metadata +29 -7
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013-
|
1
|
+
# Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -17,6 +17,8 @@
|
|
17
17
|
require "stringio"
|
18
18
|
require "rubygems/package"
|
19
19
|
|
20
|
+
require "chupa-text/path-converter"
|
21
|
+
|
20
22
|
module ChupaText
|
21
23
|
module Decomposers
|
22
24
|
class Tar < Decomposer
|
@@ -28,18 +30,22 @@ module ChupaText
|
|
28
30
|
end
|
29
31
|
|
30
32
|
def decompose(data)
|
31
|
-
|
32
|
-
|
33
|
-
|
33
|
+
data.open do |input|
|
34
|
+
Gem::Package::TarReader.new(input) do |reader|
|
35
|
+
reader.each do |entry|
|
36
|
+
next unless entry.file?
|
34
37
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
38
|
+
entry.extend(CopyStreamable)
|
39
|
+
entry_uri = data.uri.dup
|
40
|
+
base_path = entry_uri.path.gsub(/\.tar\z/i, "")
|
41
|
+
path_converter = PathConverter.new(entry.full_name,
|
42
|
+
uri_escape: true)
|
43
|
+
entry_uri.path = "#{base_path}/#{path_converter.convert}"
|
44
|
+
extracted = VirtualFileData.new(entry_uri,
|
45
|
+
entry,
|
46
|
+
:source_data => data)
|
47
|
+
yield(extracted)
|
48
|
+
end
|
43
49
|
end
|
44
50
|
end
|
45
51
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2017 Kouhei Sutou <kou@clear-code.com>
|
1
|
+
# Copyright (C) 2017-2019 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -15,13 +15,16 @@
|
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
17
|
require "stringio"
|
18
|
-
require "tmpdir"
|
19
18
|
|
20
19
|
require "archive/zip"
|
21
20
|
|
21
|
+
require "chupa-text/path-converter"
|
22
|
+
|
22
23
|
module ChupaText
|
23
24
|
module Decomposers
|
24
25
|
class Zip < Decomposer
|
26
|
+
include Loggable
|
27
|
+
|
25
28
|
registry.register("zip", self)
|
26
29
|
|
27
30
|
def target?(data)
|
@@ -32,7 +35,7 @@ module ChupaText
|
|
32
35
|
end
|
33
36
|
|
34
37
|
def decompose(data)
|
35
|
-
|
38
|
+
open_zip(data) do |zip|
|
36
39
|
zip.each do |entry|
|
37
40
|
next unless entry.file?
|
38
41
|
|
@@ -45,7 +48,10 @@ module ChupaText
|
|
45
48
|
end
|
46
49
|
entry_uri = data.uri.dup
|
47
50
|
base_path = entry_uri.path.gsub(/\.zip\z/i, "")
|
48
|
-
|
51
|
+
path_converter = PathConverter.new(entry.zip_path,
|
52
|
+
encoding: base_path.encoding,
|
53
|
+
uri_escape: true)
|
54
|
+
entry_uri.path = "#{base_path}/#{path_converter.convert}"
|
49
55
|
entry_data = VirtualFileData.new(entry_uri,
|
50
56
|
entry.file_data,
|
51
57
|
source_data: data)
|
@@ -53,6 +59,26 @@ module ChupaText
|
|
53
59
|
end
|
54
60
|
end
|
55
61
|
end
|
62
|
+
|
63
|
+
private
|
64
|
+
def open_zip(data)
|
65
|
+
begin
|
66
|
+
Archive::Zip.open(StringIO.new(data.body)) do |zip|
|
67
|
+
yield(zip)
|
68
|
+
end
|
69
|
+
rescue Archive::Zip::Error => zip_error
|
70
|
+
error do
|
71
|
+
message = "#{log_tag} Failed to process zip: "
|
72
|
+
message << "#{zip_error.class}: #{zip_error.message}\n"
|
73
|
+
message << zip_error.backtrace.join("\n")
|
74
|
+
message
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
def log_tag
|
80
|
+
"[decomposer][zip]"
|
81
|
+
end
|
56
82
|
end
|
57
83
|
end
|
58
84
|
end
|
data/lib/chupa-text/extractor.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013-
|
1
|
+
# Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -68,11 +68,13 @@ module ChupaText
|
|
68
68
|
if decomposer.nil?
|
69
69
|
if target.text_plain?
|
70
70
|
debug {"#{log_tag}[extract][text-plain]"}
|
71
|
-
yield(target)
|
71
|
+
yield(target.to_utf8_body_data)
|
72
72
|
next
|
73
73
|
else
|
74
74
|
debug {"#{log_tag}[extract][decomposer] not found"}
|
75
|
-
|
75
|
+
if target.text?
|
76
|
+
yield(target.to_utf8_body_data)
|
77
|
+
end
|
76
78
|
next
|
77
79
|
end
|
78
80
|
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
# Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
require "cgi/util"
|
18
|
+
|
19
|
+
module ChupaText
  # Converts a path taken from an archive entry (zip, tar, ...) so that it
  # can be embedded in a URI path.
  #
  # Supported options:
  #
  # * `:encoding`: when given, the path is transcoded to this encoding.
  #   A path tagged as binary (ASCII-8BIT) has its real encoding guessed
  #   first.
  # * `:uri_escape`: when true, each "/"-separated component is escaped
  #   with `CGI.escape`.
  class PathConverter
    # Encodings tried, in this order, for a binary path that contains
    # non-ASCII bytes.
    ENCODING_CANDIDATES = [
      Encoding::UTF_8,
      Encoding::EUC_JP,
      Encoding::Windows_31J,
    ].freeze

    # @param path [String] the path as stored in the archive entry.
    # @param options [Hash] see the class documentation.
    def initialize(path, options={})
      @path = path
      @options = options
    end

    # @return [String] the converted path. The string given to
    #   {#initialize} is never modified.
    def convert
      path = @path
      encoding = @options[:encoding]
      path = convert_encoding(path, encoding) if encoding
      path = convert_to_uri_path(path) if @options[:uri_escape]
      path
    end

    private
    # Transcodes `path` to `encoding`. Bytes that are invalid or have no
    # counterpart in `encoding` are dropped.
    def convert_encoding(path, encoding)
      # force_encoding retags its receiver in place, so work on a copy.
      # Retagging the caller's own string would leak an encoding change to
      # the caller and would raise FrozenError for a frozen path.
      path = path.dup
      if path.encoding == Encoding::ASCII_8BIT
        guessed_encoding = guess_encoding(path)
        # When nothing matches the path stays binary and the encode call
        # below drops every non-ASCII byte.
        path.force_encoding(guessed_encoding) if guessed_encoding
      end
      path.encode(encoding,
                  invalid: :replace,
                  undef: :replace,
                  replace: "")
    end

    # Returns the first candidate encoding in which the bytes of the binary
    # `path` are valid, or nil when none fits. An ASCII only path is
    # treated as UTF-8.
    def guess_encoding(path)
      return Encoding::UTF_8 if path.ascii_only?

      probe = path.dup
      ENCODING_CANDIDATES.find do |candidate|
        probe.force_encoding(candidate).valid_encoding?
      end
    end

    # Escapes every component of `path` while keeping "/" as the separator.
    def convert_to_uri_path(path)
      path.split("/").collect {|component| CGI.escape(component)}.join("/")
    end
  end
end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
# Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
module ChupaText
  # Converts a string in an arbitrary (possibly unknown) encoding to UTF-8.
  #
  # The string given to {#initialize} is never modified, so frozen strings
  # are accepted.
  class UTF8Converter
    # @param string [String] the text to be converted.
    def initialize(string)
      @string = string
    end

    # Returns the text as UTF-8.
    #
    # * A string tagged as UTF-8 is returned as is, minus a leading BOM.
    # * An ASCII only binary string is returned as is.
    # * A string tagged with any other encoding is transcoded to UTF-8.
    # * Any other binary string is decoded by its BOM when it has one,
    #   otherwise by a guessed encoding. When no encoding can be guessed,
    #   the bytes are treated as UTF-8 and the invalid sequences and the
    #   control characters are removed.
    #
    # @return [String]
    def convert
      encoding = @string.encoding
      case encoding
      when Encoding::UTF_8
        bom_size, = detect_bom
        return @string unless bom_size
        return @string.byteslice(bom_size, @string.bytesize - bom_size)
      when Encoding::ASCII_8BIT
        return @string if @string.ascii_only?
      else
        return @string.encode(Encoding::UTF_8,
                              invalid: :replace,
                              undef: :replace,
                              replace: "")
      end

      # Only binary strings with non-ASCII bytes reach this point.
      bom_size, bom_encoding = detect_bom
      if bom_encoding
        string_without_bom = @string.byteslice(bom_size,
                                               @string.bytesize - bom_size)
        return string_without_bom.encode(Encoding::UTF_8,
                                         bom_encoding,
                                         invalid: :replace,
                                         undef: :replace,
                                         replace: "")
      end

      guessed_encoding = guess_encoding
      if guessed_encoding
        @string.encode(Encoding::UTF_8,
                       guessed_encoding,
                       invalid: :replace,
                       undef: :replace,
                       replace: "")
      else
        utf8_body = @string.dup
        utf8_body.force_encoding(Encoding::UTF_8)
        utf8_body.scrub!("")
        utf8_body.gsub!(/\p{Control}+/, "")
        utf8_body
      end
    end

    private
    # NOTE: "private" above affects methods only; these constants stay
    # reachable from outside the class.
    UTF_8_BOM = "\xef\xbb\xbf".b.freeze
    UTF_16BE_BOM = "\xfe\xff".b.freeze
    UTF_16LE_BOM = "\xff\xfe".b.freeze
    UTF_32BE_BOM = "\x00\x00\xfe\xff".b.freeze
    UTF_32LE_BOM = "\xff\xfe\x00\x00".b.freeze

    # Looks for a Unicode BOM at the head of the string.
    #
    # Returns the BOM size in bytes and the encoding it announces, or nil
    # when the string doesn't start with a BOM.
    def detect_bom
      # The 4 byte BOMs must be checked first: the UTF-32LE BOM starts
      # with the UTF-16LE one.
      case @string.byteslice(0, 4).b
      when UTF_32BE_BOM
        return 4, Encoding::UTF_32BE
      when UTF_32LE_BOM
        return 4, Encoding::UTF_32LE
      end

      case @string.byteslice(0, 3).b
      when UTF_8_BOM
        return 3, Encoding::UTF_8
      end

      case @string.byteslice(0, 2).b
      when UTF_16BE_BOM
        return 2, Encoding::UTF_16BE
      when UTF_16LE_BOM
        return 2, Encoding::UTF_16LE
      end

      nil
    end

    # Returns the first candidate encoding in which the bytes of the string
    # are valid, or nil when none fits.
    def guess_encoding
      candidates = [
        Encoding::UTF_8,
        Encoding::EUC_JP,
        Encoding::Windows_31J,
      ]
      # Probe a private copy: force_encoding retags its receiver in place,
      # which would raise FrozenError for a frozen string and would expose
      # a temporarily wrong encoding to other holders of the same string.
      probe = @string.dup
      candidates.find do |candidate|
        probe.force_encoding(candidate).valid_encoding?
      end
    end
  end
end
|
data/lib/chupa-text/version.rb
CHANGED
@@ -289,7 +289,7 @@ class TestCommandChupaText < Test::Unit::TestCase
|
|
289
289
|
"path" => path.sub_ext(".txt").to_s,
|
290
290
|
"mime-type" => "text/plain",
|
291
291
|
"source-mime-types" => ["text/csv"],
|
292
|
-
"body" => "1
|
292
|
+
"body" => "1\t2\t3\n4\t5\t6\n7\t8\t9\n",
|
293
293
|
"size" => 18,
|
294
294
|
"screenshot" => {
|
295
295
|
"mime-type" => "image/svg+xml",
|
@@ -304,9 +304,9 @@ class TestCommandChupaText < Test::Unit::TestCase
|
|
304
304
|
x="0"
|
305
305
|
y="20"
|
306
306
|
style="font-size: 20px; white-space: pre-wrap;"
|
307
|
-
xml:space="preserve">1
|
308
|
-
4
|
309
|
-
7
|
307
|
+
xml:space="preserve">1\t2\t3
|
308
|
+
4\t5\t6
|
309
|
+
7\t8\t9
|
310
310
|
</text>
|
311
311
|
</svg>
|
312
312
|
SVG
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
1
|
+
# Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -22,15 +22,30 @@ class TestDecomposersCSV< Test::Unit::TestCase
|
|
22
22
|
end
|
23
23
|
|
24
24
|
sub_test_case("decompose") do
|
25
|
-
def
|
25
|
+
def test_valid
|
26
26
|
csv = <<-CSV
|
27
27
|
Hello,World
|
28
28
|
Ruby,ChupaText
|
29
29
|
CSV
|
30
|
-
assert_equal([csv.gsub(/,/, "
|
30
|
+
assert_equal([csv.gsub(/,/, "\t")],
|
31
31
|
decompose(csv).collect(&:body))
|
32
32
|
end
|
33
33
|
|
34
|
+
def test_invalid
|
35
|
+
messages = capture_log do
|
36
|
+
assert_equal([], decompose("He\x82\x00llo").collect(&:body))
|
37
|
+
end
|
38
|
+
assert_equal([
|
39
|
+
[
|
40
|
+
:error,
|
41
|
+
"[decomposer][csv] Failed to parse CSV: " +
|
42
|
+
"CSV::MalformedCSVError: " +
|
43
|
+
"Invalid byte sequence in UTF-8 in line 1.",
|
44
|
+
],
|
45
|
+
],
|
46
|
+
messages)
|
47
|
+
end
|
48
|
+
|
34
49
|
private
|
35
50
|
def decompose(csv)
|
36
51
|
data = ChupaText::Data.new
|
@@ -48,41 +48,32 @@ class TestDecomposersOfficeOpenXMLPresentation < Test::Unit::TestCase
|
|
48
48
|
sub_test_case("#decompose") do
|
49
49
|
sub_test_case("attributes") do
|
50
50
|
def decompose(attribute_name)
|
51
|
-
super(fixture_path("pptx", "attributes.pptx")).
|
52
|
-
data[attribute_name]
|
53
|
-
end
|
51
|
+
super(fixture_path("pptx", "attributes.pptx")).first[attribute_name]
|
54
52
|
end
|
55
53
|
|
56
54
|
def test_title
|
57
|
-
assert_equal(
|
55
|
+
assert_equal("Title", decompose("title"))
|
58
56
|
end
|
59
57
|
|
60
58
|
def test_author
|
61
|
-
assert_equal(
|
59
|
+
assert_equal(nil, decompose("author"))
|
62
60
|
end
|
63
61
|
|
64
62
|
def test_subject
|
65
|
-
assert_equal(
|
63
|
+
assert_equal("Subject", decompose("subject"))
|
66
64
|
end
|
67
65
|
|
68
66
|
def test_keywords
|
69
|
-
assert_equal(
|
67
|
+
assert_equal("Keyword1 Keyword2", decompose("keywords"))
|
70
68
|
end
|
71
69
|
|
72
70
|
def test_modified_time
|
73
|
-
assert_equal(
|
74
|
-
decompose("modified_time").collect(&:class))
|
71
|
+
assert_equal(Time, decompose("modified_time").class)
|
75
72
|
end
|
76
73
|
|
77
74
|
def test_application
|
78
|
-
assert_equal(
|
79
|
-
|
80
|
-
end
|
81
|
-
|
82
|
-
def normalize_applications(applications)
|
83
|
-
applications.collect do |application|
|
84
|
-
normalize_application(application)
|
85
|
-
end
|
75
|
+
assert_equal("LibreOffice",
|
76
|
+
normalize_application(decompose("application")))
|
86
77
|
end
|
87
78
|
|
88
79
|
def normalize_application(application)
|
@@ -92,41 +83,50 @@ class TestDecomposersOfficeOpenXMLPresentation < Test::Unit::TestCase
|
|
92
83
|
application
|
93
84
|
end
|
94
85
|
end
|
95
|
-
|
96
|
-
def test_creation_date
|
97
|
-
assert_equal([nil], decompose("creation_date"))
|
98
|
-
end
|
99
86
|
end
|
100
87
|
|
101
|
-
sub_test_case("
|
102
|
-
def decompose
|
103
|
-
super(
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
Slide1 content
|
110
|
-
BODY
|
88
|
+
sub_test_case("slides") do
|
89
|
+
def decompose(path)
|
90
|
+
super(path).collect do |data|
|
91
|
+
[
|
92
|
+
data["index"],
|
93
|
+
data.body,
|
94
|
+
]
|
95
|
+
end
|
111
96
|
end
|
112
|
-
end
|
113
97
|
|
114
|
-
|
115
|
-
|
116
|
-
|
98
|
+
def test_one_slide
|
99
|
+
assert_equal([
|
100
|
+
[nil, ""],
|
101
|
+
[
|
102
|
+
0,
|
103
|
+
"Slide1 title\n" +
|
104
|
+
"Slide1 content\n",
|
105
|
+
],
|
106
|
+
],
|
107
|
+
decompose(fixture_path("pptx", "one-slide.pptx")))
|
117
108
|
end
|
118
109
|
|
119
|
-
def
|
120
|
-
assert_equal([
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
110
|
+
def test_multi_slides
|
111
|
+
assert_equal([
|
112
|
+
[nil, ""],
|
113
|
+
[
|
114
|
+
0,
|
115
|
+
"Slide1 title\n" +
|
116
|
+
"Slide1 content\n",
|
117
|
+
],
|
118
|
+
[
|
119
|
+
1,
|
120
|
+
"Slide2 title\n" +
|
121
|
+
"Slide2 content\n",
|
122
|
+
],
|
123
|
+
[
|
124
|
+
2,
|
125
|
+
"Slide3 title\n" +
|
126
|
+
"Slide3 content\n",
|
127
|
+
],
|
128
|
+
],
|
129
|
+
decompose(fixture_path("pptx", "multi-slides.pptx")))
|
130
130
|
end
|
131
131
|
end
|
132
132
|
end
|