chupa-text 1.1.7 → 1.1.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/doc/text/news.md +20 -0
- data/lib/chupa-text.rb +1 -0
- data/lib/chupa-text/command/chupa-text.rb +8 -2
- data/lib/chupa-text/data.rb +14 -4
- data/lib/chupa-text/decomposers/gzip.rb +23 -2
- data/lib/chupa-text/decomposers/office-open-xml-document.rb +4 -0
- data/lib/chupa-text/decomposers/office-open-xml-presentation.rb +4 -0
- data/lib/chupa-text/decomposers/office-open-xml-workbook.rb +4 -0
- data/lib/chupa-text/decomposers/office-open-xml.rb +23 -24
- data/lib/chupa-text/decomposers/opendocument-presentation.rb +4 -0
- data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb +4 -0
- data/lib/chupa-text/decomposers/opendocument-text.rb +4 -0
- data/lib/chupa-text/decomposers/opendocument.rb +20 -17
- data/lib/chupa-text/decomposers/xml.rb +36 -9
- data/lib/chupa-text/decomposers/zip.rb +2 -20
- data/lib/chupa-text/extractor.rb +33 -33
- data/lib/chupa-text/input-data.rb +4 -0
- data/lib/chupa-text/sax-parser.rb +24 -2
- data/lib/chupa-text/unzippable.rb +39 -0
- data/lib/chupa-text/utf8-converter.rb +42 -24
- data/lib/chupa-text/version.rb +1 -1
- data/test/decomposers/test-gzip.rb +19 -2
- data/test/decomposers/test-office-open-xml-document.rb +18 -0
- data/test/decomposers/test-office-open-xml-presentation.rb +18 -0
- data/test/decomposers/test-office-open-xml-workbook.rb +18 -0
- data/test/decomposers/test-opendocument-presentation.rb +18 -0
- data/test/decomposers/test-opendocument-spreadsheet.rb +18 -0
- data/test/decomposers/test-opendocument-text.rb +18 -0
- data/test/decomposers/test-xml.rb +41 -2
- data/test/fixture/docx/empty.docx +0 -0
- data/test/fixture/odp/empty.odp +0 -0
- data/test/fixture/ods/empty.ods +0 -0
- data/test/fixture/odt/empty.odt +0 -0
- data/test/fixture/pptx/empty.pptx +0 -0
- data/test/fixture/xlsx/empty.xlsx +0 -0
- data/test/test-extractor.rb +10 -0
- metadata +9 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 99f896b313c052ecad3ea86ac6009c161cb2b3216b828aaec67da9dc886c5fbe
|
4
|
+
data.tar.gz: 986e35e1c19faa63652d0d8f43c9bd73e527e1d7af2593eea4525ee473414aa1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 962932aed44748ca3c4809434d3e943ab5a113a6f57f058d29f7c60e6d1090f07d272d85221a5f62e9459f8a84f05725c7d5d33b15564d224dadb20215c155ae
|
7
|
+
data.tar.gz: 65face75476f34016b17b64cd0d843e305b87052192d944f2e60f5f06ea91b218e6d4fce1271eacf8f4de7152113fb87de86665b12b9fc479e80789dd0e428dd
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,25 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 1.1.8: 2019-03-03
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* `gzip`: Added error checks.
|
8
|
+
|
9
|
+
* `xml`:
|
10
|
+
|
11
|
+
* Added error checks.
|
12
|
+
|
13
|
+
* Added support for Nokogiri as an alternative backend.
|
14
|
+
|
15
|
+
* Reduced memory usage.
|
16
|
+
|
17
|
+
* Added support for body size limitation.
|
18
|
+
|
19
|
+
* `opendocument`: Added error checks.
|
20
|
+
|
21
|
+
* `office-open-xml`: Added error checks.
|
22
|
+
|
3
23
|
## 1.1.7: 2019-03-01
|
4
24
|
|
5
25
|
### Improvements
|
data/lib/chupa-text.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013-
|
1
|
+
# Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -49,6 +49,7 @@ module ChupaText
|
|
49
49
|
@mime_formatter_options = {}
|
50
50
|
@need_screenshot = true
|
51
51
|
@expected_screenshot_size = [200, 200]
|
52
|
+
@max_body_size = nil
|
52
53
|
end
|
53
54
|
|
54
55
|
def run(*arguments)
|
@@ -143,6 +144,11 @@ module ChupaText
|
|
143
144
|
"(default: #{@expected_screenshot_size.join("x")})") do |size|
|
144
145
|
@expected_screenshot_size = size
|
145
146
|
end
|
147
|
+
parser.on("--max-body-size=BYTE", Integer,
|
148
|
+
"The max byte of extracted body.",
|
149
|
+
"(default: no limit)") do |size|
|
150
|
+
@max_body_size = size
|
151
|
+
end
|
146
152
|
|
147
153
|
parser.separator("")
|
148
154
|
parser.separator("Log related options:")
|
@@ -190,7 +196,7 @@ module ChupaText
|
|
190
196
|
end
|
191
197
|
|
192
198
|
def create_extractor
|
193
|
-
extractor = Extractor.new
|
199
|
+
extractor = Extractor.new(max_body_size: @max_body_size)
|
194
200
|
extractor.apply_configuration(@configuration)
|
195
201
|
extractor
|
196
202
|
end
|
data/lib/chupa-text/data.rb
CHANGED
@@ -141,7 +141,9 @@ module ChupaText
|
|
141
141
|
end
|
142
142
|
|
143
143
|
def peek_body(size)
|
144
|
-
body
|
144
|
+
_body = body
|
145
|
+
return nil if _body.nil?
|
146
|
+
_body[0, size]
|
145
147
|
end
|
146
148
|
|
147
149
|
def [](name)
|
@@ -196,12 +198,20 @@ module ChupaText
|
|
196
198
|
@need_screenshot
|
197
199
|
end
|
198
200
|
|
199
|
-
def to_utf8_body_data
|
200
|
-
b =
|
201
|
+
def to_utf8_body_data(max_body_size: nil)
|
202
|
+
b = nil
|
203
|
+
if max_body_size
|
204
|
+
open do |input|
|
205
|
+
b = input.read(max_body_size)
|
206
|
+
end
|
207
|
+
else
|
208
|
+
b = body
|
209
|
+
end
|
201
210
|
return self if b.nil?
|
211
|
+
|
202
212
|
converter = UTF8Converter.new(b)
|
203
213
|
utf8_body = converter.convert
|
204
|
-
if b.equal?(utf8_body)
|
214
|
+
if max_body_size.nil? and b.equal?(utf8_body)
|
205
215
|
self
|
206
216
|
else
|
207
217
|
TextData.new(utf8_body, source_data: self)
|
@@ -19,6 +19,8 @@ require "zlib"
|
|
19
19
|
module ChupaText
|
20
20
|
module Decomposers
|
21
21
|
class Gzip < Decomposer
|
22
|
+
include Loggable
|
23
|
+
|
22
24
|
registry.register("gzip", self)
|
23
25
|
|
24
26
|
TARGET_EXTENSIONS = ["gz", "tgz"]
|
@@ -33,8 +35,7 @@ module ChupaText
|
|
33
35
|
end
|
34
36
|
|
35
37
|
def decompose(data)
|
36
|
-
data
|
37
|
-
reader = Zlib::GzipReader.new(input)
|
38
|
+
open_reader(data) do |reader|
|
38
39
|
uri = nil
|
39
40
|
case data.extension
|
40
41
|
when "gz"
|
@@ -46,6 +47,26 @@ module ChupaText
|
|
46
47
|
yield(extracted)
|
47
48
|
end
|
48
49
|
end
|
50
|
+
|
51
|
+
private
|
52
|
+
def open_reader(data)
|
53
|
+
data.open do |input|
|
54
|
+
begin
|
55
|
+
yield(Zlib::GzipReader.new(input))
|
56
|
+
rescue Zlib::Error => zlib_error
|
57
|
+
error do
|
58
|
+
message = "#{log_tag} Failed to uncompress: "
|
59
|
+
message << "#{zlib_error.class}: #{zlib_error.message}\n"
|
60
|
+
message << zlib_error.backtrace.join("\n")
|
61
|
+
message
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
def log_tag
|
68
|
+
"[decomposer][gzip]"
|
69
|
+
end
|
49
70
|
end
|
50
71
|
end
|
51
72
|
end
|
@@ -14,13 +14,14 @@
|
|
14
14
|
# License along with this library; if not, write to the Free Software
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
|
-
require "archive/zip"
|
18
|
-
|
19
17
|
require "chupa-text/sax-parser"
|
20
18
|
|
21
19
|
module ChupaText
|
22
20
|
module Decomposers
|
23
21
|
class OfficeOpenXML < Decomposer
|
22
|
+
include Loggable
|
23
|
+
include Unzippable
|
24
|
+
|
24
25
|
def target?(data)
|
25
26
|
@extensions.include?(data.extension) or
|
26
27
|
@mime_types.include?(data.mime_type)
|
@@ -35,29 +36,27 @@ module ChupaText
|
|
35
36
|
end
|
36
37
|
|
37
38
|
def decompose(data, &block)
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
process_entry(entry, context)
|
56
|
-
end
|
39
|
+
unzip(data) do |zip|
|
40
|
+
context = {
|
41
|
+
data: data,
|
42
|
+
attributes: {},
|
43
|
+
}
|
44
|
+
start_decompose(context)
|
45
|
+
zip.each do |entry|
|
46
|
+
next unless entry.file?
|
47
|
+
case entry.zip_path
|
48
|
+
when "docProps/app.xml"
|
49
|
+
listener = AttributesListener.new(context[:attributes])
|
50
|
+
parse(entry.file_data, listener)
|
51
|
+
when "docProps/core.xml"
|
52
|
+
listener = AttributesListener.new(context[:attributes])
|
53
|
+
parse(entry.file_data, listener)
|
54
|
+
else
|
55
|
+
process_entry(entry, context)
|
57
56
|
end
|
58
57
|
end
|
58
|
+
finish_decompose(context, &block)
|
59
59
|
end
|
60
|
-
finish_decompose(context, &block)
|
61
60
|
end
|
62
61
|
|
63
62
|
private
|
@@ -71,8 +70,8 @@ module ChupaText
|
|
71
70
|
parse(entry.file_data, listener)
|
72
71
|
end
|
73
72
|
|
74
|
-
def
|
75
|
-
|
73
|
+
def log_tag
|
74
|
+
"[decomposer][office-open-xml]"
|
76
75
|
end
|
77
76
|
|
78
77
|
class TextListener < SAXListener
|
@@ -14,13 +14,14 @@
|
|
14
14
|
# License along with this library; if not, write to the Free Software
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
|
-
require "archive/zip"
|
18
|
-
|
19
17
|
require "chupa-text/sax-parser"
|
20
18
|
|
21
19
|
module ChupaText
|
22
20
|
module Decomposers
|
23
21
|
class OpenDocument < Decomposer
|
22
|
+
include Loggable
|
23
|
+
include Unzippable
|
24
|
+
|
24
25
|
def target?(data)
|
25
26
|
data.extension == @extension or
|
26
27
|
data.mime_type == @mime_type
|
@@ -35,24 +36,22 @@ module ChupaText
|
|
35
36
|
end
|
36
37
|
|
37
38
|
def decompose(data, &block)
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
process_meta(entry, context, &block)
|
51
|
-
end
|
39
|
+
unzip(data) do |zip|
|
40
|
+
context = {
|
41
|
+
data: data,
|
42
|
+
attributes: {},
|
43
|
+
}
|
44
|
+
zip.each do |entry|
|
45
|
+
next unless entry.file?
|
46
|
+
case entry.zip_path
|
47
|
+
when "content.xml"
|
48
|
+
process_content(entry, context, &block)
|
49
|
+
when "meta.xml"
|
50
|
+
process_meta(entry, context, &block)
|
52
51
|
end
|
53
52
|
end
|
53
|
+
finish_decompose(context, &block)
|
54
54
|
end
|
55
|
-
finish_decompose(context, &block)
|
56
55
|
end
|
57
56
|
|
58
57
|
private
|
@@ -66,6 +65,10 @@ module ChupaText
|
|
66
65
|
parse(entry.file_data, listener)
|
67
66
|
end
|
68
67
|
|
68
|
+
def log_tag
|
69
|
+
"[decomposer][opendocument]"
|
70
|
+
end
|
71
|
+
|
69
72
|
class AttributesListener < SAXListener
|
70
73
|
META_URI = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
|
71
74
|
DUBLIN_CORE_URI = "http://purl.org/dc/elements/1.1/"
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
1
|
+
# Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -14,12 +14,13 @@
|
|
14
14
|
# License along with this library; if not, write to the Free Software
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
|
-
require "
|
18
|
-
require "rexml/streamlistener"
|
17
|
+
require "chupa-text/sax-parser"
|
19
18
|
|
20
19
|
module ChupaText
|
21
20
|
module Decomposers
|
22
21
|
class XML < Decomposer
|
22
|
+
include Loggable
|
23
|
+
|
23
24
|
registry.register("xml", self)
|
24
25
|
|
25
26
|
def target?(data)
|
@@ -31,22 +32,48 @@ module ChupaText
|
|
31
32
|
text = ""
|
32
33
|
listener = Listener.new(text)
|
33
34
|
data.open do |input|
|
34
|
-
|
35
|
-
|
35
|
+
begin
|
36
|
+
parser = SAXParser.new(input, listener)
|
37
|
+
parser.parse
|
38
|
+
rescue SAXParser::ParseError => xml_error
|
39
|
+
error do
|
40
|
+
message = "#{log_tag} Failed to parse XML: "
|
41
|
+
message << "#{xml_error.class}: #{xml_error.message}\n"
|
42
|
+
message << xml_error.backtrace.join("\n")
|
43
|
+
message
|
44
|
+
end
|
45
|
+
return
|
46
|
+
end
|
36
47
|
end
|
37
48
|
text_data = TextData.new(text, :source_data => data)
|
38
49
|
yield(text_data)
|
39
50
|
end
|
40
51
|
|
41
|
-
|
42
|
-
|
52
|
+
private
|
53
|
+
def log_tag
|
54
|
+
"[decomposer][xml]"
|
55
|
+
end
|
43
56
|
|
57
|
+
class Listener < SAXListener
|
44
58
|
def initialize(output)
|
45
59
|
@output = output
|
60
|
+
@level = 0
|
61
|
+
end
|
62
|
+
|
63
|
+
def start_element(*args)
|
64
|
+
@level += 1
|
65
|
+
end
|
66
|
+
|
67
|
+
def end_element(*args)
|
68
|
+
@level -= 1
|
69
|
+
end
|
70
|
+
|
71
|
+
def characters(text)
|
72
|
+
@output << text if @level > 0
|
46
73
|
end
|
47
74
|
|
48
|
-
def
|
49
|
-
@output <<
|
75
|
+
def cdata(content)
|
76
|
+
@output << content if @level > 0
|
50
77
|
end
|
51
78
|
end
|
52
79
|
end
|
@@ -14,14 +14,13 @@
|
|
14
14
|
# License along with this library; if not, write to the Free Software
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
|
-
require "archive/zip"
|
18
|
-
|
19
17
|
require "chupa-text/path-converter"
|
20
18
|
|
21
19
|
module ChupaText
|
22
20
|
module Decomposers
|
23
21
|
class Zip < Decomposer
|
24
22
|
include Loggable
|
23
|
+
include Unzippable
|
25
24
|
|
26
25
|
registry.register("zip", self)
|
27
26
|
|
@@ -33,7 +32,7 @@ module ChupaText
|
|
33
32
|
end
|
34
33
|
|
35
34
|
def decompose(data)
|
36
|
-
|
35
|
+
unzip(data) do |zip|
|
37
36
|
zip.each do |entry|
|
38
37
|
next unless entry.file?
|
39
38
|
|
@@ -68,23 +67,6 @@ module ChupaText
|
|
68
67
|
end
|
69
68
|
|
70
69
|
private
|
71
|
-
def open_zip(data)
|
72
|
-
begin
|
73
|
-
data.open do |input|
|
74
|
-
Archive::Zip.open(input) do |zip|
|
75
|
-
yield(zip)
|
76
|
-
end
|
77
|
-
end
|
78
|
-
rescue Archive::Zip::Error => zip_error
|
79
|
-
error do
|
80
|
-
message = "#{log_tag} Failed to process zip: "
|
81
|
-
message << "#{zip_error.class}: #{zip_error.message}\n"
|
82
|
-
message << zip_error.backtrace.join("\n")
|
83
|
-
message
|
84
|
-
end
|
85
|
-
end
|
86
|
-
end
|
87
|
-
|
88
70
|
def log_tag
|
89
71
|
"[decomposer][zip]"
|
90
72
|
end
|
data/lib/chupa-text/extractor.rb
CHANGED
@@ -21,8 +21,9 @@ module ChupaText
|
|
21
21
|
class Extractor
|
22
22
|
include Loggable
|
23
23
|
|
24
|
-
def initialize
|
24
|
+
def initialize(max_body_size: nil)
|
25
25
|
@decomposers = []
|
26
|
+
@max_body_size = max_body_size
|
26
27
|
end
|
27
28
|
|
28
29
|
# Sets the extractor up by the configuration. It adds decomposers
|
@@ -57,38 +58,8 @@ module ChupaText
|
|
57
58
|
# You can get text data by `text_data.body`.
|
58
59
|
#
|
59
60
|
# @return [void]
|
60
|
-
def extract(input)
|
61
|
-
|
62
|
-
until targets.empty?
|
63
|
-
target = targets.shift
|
64
|
-
debug do
|
65
|
-
"#{log_tag}[extract][target] <#{target.uri}>:<#{target.mime_type}>"
|
66
|
-
end
|
67
|
-
decomposer = find_decomposer(target)
|
68
|
-
if decomposer.nil?
|
69
|
-
if target.text_plain?
|
70
|
-
debug {"#{log_tag}[extract][text-plain]"}
|
71
|
-
yield(target.to_utf8_body_data)
|
72
|
-
next
|
73
|
-
else
|
74
|
-
debug {"#{log_tag}[extract][decomposer] not found"}
|
75
|
-
if target.text?
|
76
|
-
yield(target.to_utf8_body_data)
|
77
|
-
end
|
78
|
-
next
|
79
|
-
end
|
80
|
-
end
|
81
|
-
debug {"#{log_tag}[extract][decomposer] #{decomposer.class}"}
|
82
|
-
decomposer.decompose(target) do |decomposed|
|
83
|
-
debug do
|
84
|
-
"#{log_tag}[extract][decomposed] " +
|
85
|
-
"#{decomposer.class}: " +
|
86
|
-
"<#{target.uri}>: " +
|
87
|
-
"<#{target.mime_type}> -> <#{decomposed.mime_type}>"
|
88
|
-
end
|
89
|
-
targets.push(decomposed)
|
90
|
-
end
|
91
|
-
end
|
61
|
+
def extract(input, &block)
|
62
|
+
extract_recursive(ensure_data(input), &block)
|
92
63
|
end
|
93
64
|
|
94
65
|
private
|
@@ -112,6 +83,35 @@ module ChupaText
|
|
112
83
|
candidate[1]
|
113
84
|
end
|
114
85
|
|
86
|
+
def extract_recursive(target, &block)
|
87
|
+
debug do
|
88
|
+
"#{log_tag}[extract][target] <#{target.uri}>:<#{target.mime_type}>"
|
89
|
+
end
|
90
|
+
decomposer = find_decomposer(target)
|
91
|
+
if decomposer.nil?
|
92
|
+
if target.text_plain?
|
93
|
+
debug {"#{log_tag}[extract][text-plain]"}
|
94
|
+
yield(target.to_utf8_body_data(max_body_size: @max_body_size))
|
95
|
+
else
|
96
|
+
debug {"#{log_tag}[extract][decomposer] not found"}
|
97
|
+
if target.text?
|
98
|
+
yield(target.to_utf8_body_data(max_body_size: @max_body_size))
|
99
|
+
end
|
100
|
+
end
|
101
|
+
else
|
102
|
+
debug {"#{log_tag}[extract][decomposer] #{decomposer.class}"}
|
103
|
+
decomposer.decompose(target) do |decomposed|
|
104
|
+
debug do
|
105
|
+
"#{log_tag}[extract][decomposed] " +
|
106
|
+
"#{decomposer.class}: " +
|
107
|
+
"<#{target.uri}>: " +
|
108
|
+
"<#{target.mime_type}> -> <#{decomposed.mime_type}>"
|
109
|
+
end
|
110
|
+
extract_recursive(decomposed, &block)
|
111
|
+
end
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
115
|
def log_tag
|
116
116
|
"[extractor]"
|
117
117
|
end
|
@@ -25,6 +25,9 @@ end
|
|
25
25
|
|
26
26
|
module ChupaText
|
27
27
|
class SAXParser
|
28
|
+
class ParseError < Error
|
29
|
+
end
|
30
|
+
|
28
31
|
class << self
|
29
32
|
def backend
|
30
33
|
case ENV["CHUPA_TEXT_SAX_PARSER_BACKEND"]
|
@@ -94,6 +97,10 @@ module ChupaText
|
|
94
97
|
@listener.cdata(content)
|
95
98
|
end
|
96
99
|
|
100
|
+
def error(detail)
|
101
|
+
raise ParseError, detail
|
102
|
+
end
|
103
|
+
|
97
104
|
private
|
98
105
|
def build_qname(prefix, local_name)
|
99
106
|
if prefix
|
@@ -105,10 +112,25 @@ module ChupaText
|
|
105
112
|
end
|
106
113
|
else
|
107
114
|
def parse
|
108
|
-
source =
|
115
|
+
source = @input
|
116
|
+
if source.is_a?(Archive::Zip::Codec::Deflate::Decompress)
|
117
|
+
source = source.read
|
118
|
+
end
|
109
119
|
parser = REXML::Parsers::SAX2Parser.new(source)
|
110
120
|
parser.listen(Listener.new(@listener))
|
111
|
-
|
121
|
+
begin
|
122
|
+
parser.parse
|
123
|
+
rescue REXML::ParseException => error
|
124
|
+
message = "#{error.class}: #{error.message}"
|
125
|
+
raise ParseError, message
|
126
|
+
rescue ArgumentError => error
|
127
|
+
if error.message.start_with?("invalid byte sequence")
|
128
|
+
message = "#{error.class}: #{error.message}"
|
129
|
+
raise ParseError, message
|
130
|
+
else
|
131
|
+
raise
|
132
|
+
end
|
133
|
+
end
|
112
134
|
end
|
113
135
|
|
114
136
|
class Listener
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
require "archive/zip"
|
18
|
+
|
19
|
+
module ChupaText
|
20
|
+
module Unzippable
|
21
|
+
private
|
22
|
+
def unzip(data)
|
23
|
+
data.open do |input|
|
24
|
+
begin
|
25
|
+
Archive::Zip.open(input) do |zip|
|
26
|
+
yield(zip)
|
27
|
+
end
|
28
|
+
rescue Archive::Zip::Error => zip_error
|
29
|
+
error do
|
30
|
+
message = "#{log_tag} Failed to process zip: "
|
31
|
+
message << "#{zip_error.class}: #{zip_error.message}\n"
|
32
|
+
message << zip_error.backtrace.join("\n")
|
33
|
+
message
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
@@ -16,8 +16,9 @@
|
|
16
16
|
|
17
17
|
module ChupaText
|
18
18
|
class UTF8Converter
|
19
|
-
def initialize(string)
|
19
|
+
def initialize(string, max_size: nil)
|
20
20
|
@string = string
|
21
|
+
@max_size = max_size
|
21
22
|
end
|
22
23
|
|
23
24
|
def convert
|
@@ -26,44 +27,51 @@ module ChupaText
|
|
26
27
|
when Encoding::UTF_8
|
27
28
|
bom_size, bom_encoding = detect_bom
|
28
29
|
if bom_size
|
29
|
-
|
30
|
-
|
30
|
+
utf8_string = @string.byteslice(bom_size,
|
31
|
+
@string.bytesize - bom_size)
|
31
32
|
else
|
32
|
-
|
33
|
+
utf8_string = @string
|
33
34
|
end
|
35
|
+
return truncate(utf8_string)
|
34
36
|
when Encoding::ASCII_8BIT
|
35
|
-
return @string if @string.ascii_only?
|
37
|
+
return truncate(@string) if @string.ascii_only?
|
36
38
|
else
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
39
|
+
utf8_string = @string.encode(Encoding::UTF_8,
|
40
|
+
invalid: :replace,
|
41
|
+
undef: :replace,
|
42
|
+
replace: "")
|
43
|
+
return truncate(utf8_string)
|
41
44
|
end
|
42
45
|
|
43
46
|
bom_size, bom_encoding = detect_bom
|
44
47
|
if bom_encoding
|
45
48
|
string_without_bom = @string.byteslice(bom_size,
|
46
49
|
@string.bytesize - bom_size)
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
50
|
+
utf8_string = string_without_bom.encode(Encoding::UTF_8,
|
51
|
+
bom_encoding,
|
52
|
+
invalid: :replace,
|
53
|
+
undef: :replace,
|
54
|
+
replace: "")
|
55
|
+
return truncate(utf8_string)
|
52
56
|
end
|
53
57
|
|
54
58
|
guessed_encoding = guess_encoding
|
55
59
|
if guessed_encoding
|
56
|
-
@string.encode(Encoding::UTF_8,
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
60
|
+
truncate(@string.encode(Encoding::UTF_8,
|
61
|
+
guessed_encoding,
|
62
|
+
invalid: :replace,
|
63
|
+
undef: :replace,
|
64
|
+
replace: ""))
|
61
65
|
else
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
66
|
+
if @max_size
|
67
|
+
utf8_string = @string.byteslice(0, @max_size)
|
68
|
+
else
|
69
|
+
utf8_string = @string.dup
|
70
|
+
end
|
71
|
+
utf8_string.force_encoding(Encoding::UTF_8)
|
72
|
+
utf8_string.scrub!("")
|
73
|
+
utf8_string.gsub!(/\p{Control}+/, "")
|
74
|
+
utf8_string
|
67
75
|
end
|
68
76
|
end
|
69
77
|
|
@@ -113,5 +121,15 @@ module ChupaText
|
|
113
121
|
@string.force_encoding(original_encoding)
|
114
122
|
end
|
115
123
|
end
|
124
|
+
|
125
|
+
def truncate(string)
|
126
|
+
if @max_size and string.bytesize > @max_size
|
127
|
+
truncated = string.byteslice(0, @max_size)
|
128
|
+
truncated.scrub!("")
|
129
|
+
truncated
|
130
|
+
else
|
131
|
+
string
|
132
|
+
end
|
133
|
+
end
|
116
134
|
end
|
117
135
|
end
|
data/lib/chupa-text/version.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013-
|
1
|
+
# Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -83,7 +83,6 @@ class TestDecomposersGzip < Test::Unit::TestCase
|
|
83
83
|
end
|
84
84
|
end
|
85
85
|
|
86
|
-
|
87
86
|
sub_test_case("tgz") do
|
88
87
|
def setup
|
89
88
|
super
|
@@ -109,5 +108,23 @@ class TestDecomposersGzip < Test::Unit::TestCase
|
|
109
108
|
decompose(@data).collect(&:source))
|
110
109
|
end
|
111
110
|
end
|
111
|
+
|
112
|
+
def test_invalid
|
113
|
+
data = ChupaText::Data.new
|
114
|
+
data.body = "Hello"
|
115
|
+
data.size = data.body.bytesize
|
116
|
+
data.mime_type = "application/gzip"
|
117
|
+
messages = capture_log do
|
118
|
+
assert_equal([], decompose(data).collect(&:body))
|
119
|
+
end
|
120
|
+
assert_equal([
|
121
|
+
[
|
122
|
+
:error,
|
123
|
+
"[decomposer][gzip] Failed to uncompress: " +
|
124
|
+
"Zlib::GzipFile::Error: not in gzip format",
|
125
|
+
],
|
126
|
+
],
|
127
|
+
messages)
|
128
|
+
end
|
112
129
|
end
|
113
130
|
end
|
@@ -140,5 +140,23 @@ Single quote: ''
|
|
140
140
|
BODY
|
141
141
|
end
|
142
142
|
end
|
143
|
+
|
144
|
+
sub_test_case("invalid") do
|
145
|
+
def test_empty
|
146
|
+
messages = capture_log do
|
147
|
+
assert_equal([], decompose(fixture_path("docx", "empty.docx")))
|
148
|
+
end
|
149
|
+
assert_equal([
|
150
|
+
[
|
151
|
+
:error,
|
152
|
+
"[decomposer][office-open-xml][document] " +
|
153
|
+
"Failed to process zip: " +
|
154
|
+
"Archive::Zip::UnzipError: " +
|
155
|
+
"unable to locate end-of-central-directory record",
|
156
|
+
],
|
157
|
+
],
|
158
|
+
messages)
|
159
|
+
end
|
160
|
+
end
|
143
161
|
end
|
144
162
|
end
|
@@ -129,5 +129,23 @@ class TestDecomposersOfficeOpenXMLPresentation < Test::Unit::TestCase
|
|
129
129
|
decompose(fixture_path("pptx", "multi-slides.pptx")))
|
130
130
|
end
|
131
131
|
end
|
132
|
+
|
133
|
+
sub_test_case("invalid") do
|
134
|
+
def test_empty
|
135
|
+
messages = capture_log do
|
136
|
+
assert_equal([], decompose(fixture_path("pptx", "empty.pptx")))
|
137
|
+
end
|
138
|
+
assert_equal([
|
139
|
+
[
|
140
|
+
:error,
|
141
|
+
"[decomposer][office-open-xml][presentation] " +
|
142
|
+
"Failed to process zip: " +
|
143
|
+
"Archive::Zip::UnzipError: " +
|
144
|
+
"unable to locate end-of-central-directory record",
|
145
|
+
],
|
146
|
+
],
|
147
|
+
messages)
|
148
|
+
end
|
149
|
+
end
|
132
150
|
end
|
133
151
|
end
|
@@ -152,5 +152,23 @@ class TestDecomposersOfficeOpenXMLWorkbook < Test::Unit::TestCase
|
|
152
152
|
decompose(fixture_path("xlsx", "multi-sheets.xlsx")))
|
153
153
|
end
|
154
154
|
end
|
155
|
+
|
156
|
+
sub_test_case("invalid") do
|
157
|
+
def test_empty
|
158
|
+
messages = capture_log do
|
159
|
+
assert_equal([], decompose(fixture_path("xlsx", "empty.xlsx")))
|
160
|
+
end
|
161
|
+
assert_equal([
|
162
|
+
[
|
163
|
+
:error,
|
164
|
+
"[decomposer][office-open-xml][workbook] " +
|
165
|
+
"Failed to process zip: " +
|
166
|
+
"Archive::Zip::UnzipError: " +
|
167
|
+
"unable to locate end-of-central-directory record",
|
168
|
+
],
|
169
|
+
],
|
170
|
+
messages)
|
171
|
+
end
|
172
|
+
end
|
155
173
|
end
|
156
174
|
end
|
@@ -132,5 +132,23 @@ class TestDecomposersOpenDocumentPresentation < Test::Unit::TestCase
|
|
132
132
|
decompose.collect {|data| [data["index"], data.body]})
|
133
133
|
end
|
134
134
|
end
|
135
|
+
|
136
|
+
sub_test_case("invalid") do
|
137
|
+
def test_empty
|
138
|
+
messages = capture_log do
|
139
|
+
assert_equal([], decompose(fixture_path("odp", "empty.odp")))
|
140
|
+
end
|
141
|
+
assert_equal([
|
142
|
+
[
|
143
|
+
:error,
|
144
|
+
"[decomposer][opendocument][presentation] " +
|
145
|
+
"Failed to process zip: " +
|
146
|
+
"Archive::Zip::UnzipError: " +
|
147
|
+
"unable to locate end-of-central-directory record",
|
148
|
+
],
|
149
|
+
],
|
150
|
+
messages)
|
151
|
+
end
|
152
|
+
end
|
135
153
|
end
|
136
154
|
end
|
@@ -164,5 +164,23 @@ class TestDecomposersOpenDocumentSpreadsheet < Test::Unit::TestCase
|
|
164
164
|
decompose(fixture_path("ods", "shapes.ods")))
|
165
165
|
end
|
166
166
|
end
|
167
|
+
|
168
|
+
sub_test_case("invalid") do
|
169
|
+
def test_empty
|
170
|
+
messages = capture_log do
|
171
|
+
assert_equal([], decompose(fixture_path("ods", "empty.ods")))
|
172
|
+
end
|
173
|
+
assert_equal([
|
174
|
+
[
|
175
|
+
:error,
|
176
|
+
"[decomposer][opendocument][spreadsheet] " +
|
177
|
+
"Failed to process zip: " +
|
178
|
+
"Archive::Zip::UnzipError: " +
|
179
|
+
"unable to locate end-of-central-directory record",
|
180
|
+
],
|
181
|
+
],
|
182
|
+
messages)
|
183
|
+
end
|
184
|
+
end
|
167
185
|
end
|
168
186
|
end
|
@@ -140,5 +140,23 @@ Single quote: ''
|
|
140
140
|
BODY
|
141
141
|
end
|
142
142
|
end
|
143
|
+
|
144
|
+
sub_test_case("invalid") do
|
145
|
+
def test_empty
|
146
|
+
messages = capture_log do
|
147
|
+
assert_equal([], decompose(fixture_path("odt", "empty.odt")))
|
148
|
+
end
|
149
|
+
assert_equal([
|
150
|
+
[
|
151
|
+
:error,
|
152
|
+
"[decomposer][opendocument][text] " +
|
153
|
+
"Failed to process zip: " +
|
154
|
+
"Archive::Zip::UnzipError: " +
|
155
|
+
"unable to locate end-of-central-directory record",
|
156
|
+
],
|
157
|
+
],
|
158
|
+
messages)
|
159
|
+
end
|
160
|
+
end
|
143
161
|
end
|
144
162
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
1
|
+
# Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -35,12 +35,40 @@ class TestDecomposersXML < Test::Unit::TestCase
|
|
35
35
|
Hello
|
36
36
|
&
|
37
37
|
World
|
38
|
-
|
39
38
|
TEXT
|
40
39
|
assert_equal([text],
|
41
40
|
decompose(xml).collect(&:body))
|
42
41
|
end
|
43
42
|
|
43
|
+
def test_invalid_xml
|
44
|
+
messages = capture_log do
|
45
|
+
assert_equal([], decompose("<root x=/>"))
|
46
|
+
end
|
47
|
+
assert_equal([
|
48
|
+
[
|
49
|
+
:error,
|
50
|
+
"[decomposer][xml] Failed to parse XML: " +
|
51
|
+
"ChupaText::SAXParser::ParseError: ...",
|
52
|
+
],
|
53
|
+
],
|
54
|
+
messages)
|
55
|
+
end
|
56
|
+
|
57
|
+
def test_invalid_encoding
|
58
|
+
messages = capture_log do
|
59
|
+
assert_equal([],
|
60
|
+
decompose("\x00\x05\a\xA6"))
|
61
|
+
end
|
62
|
+
assert_equal([
|
63
|
+
[
|
64
|
+
:error,
|
65
|
+
"[decomposer][xml] Failed to parse XML: " +
|
66
|
+
"ChupaText::SAXParser::ParseError: ...",
|
67
|
+
],
|
68
|
+
],
|
69
|
+
messages)
|
70
|
+
end
|
71
|
+
|
44
72
|
private
|
45
73
|
def decompose(xml)
|
46
74
|
data = ChupaText::Data.new
|
@@ -54,5 +82,16 @@ class TestDecomposersXML < Test::Unit::TestCase
|
|
54
82
|
end
|
55
83
|
decomposed
|
56
84
|
end
|
85
|
+
|
86
|
+
def capture_log
|
87
|
+
messages = super
|
88
|
+
messages.collect do |level, message|
|
89
|
+
[
|
90
|
+
level,
|
91
|
+
message.gsub(/(ChupaText::SAXParser::ParseError:) .*/,
|
92
|
+
"\\1 ...")
|
93
|
+
]
|
94
|
+
end
|
95
|
+
end
|
57
96
|
end
|
58
97
|
end
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
data/test/test-extractor.rb
CHANGED
@@ -228,5 +228,15 @@ class TestExtractor < Test::Unit::TestCase
|
|
228
228
|
assert_equal(["こんにちは"], extract(data))
|
229
229
|
end
|
230
230
|
end
|
231
|
+
|
232
|
+
sub_test_case("max body size") do
|
233
|
+
def test_last_invalid
|
234
|
+
@extractor = ChupaText::Extractor.new(max_body_size: 5)
|
235
|
+
data = ChupaText::Data.new
|
236
|
+
data.mime_type = "text/plain"
|
237
|
+
data.body = "こん"
|
238
|
+
assert_equal(["こ"], extract(data))
|
239
|
+
end
|
240
|
+
end
|
231
241
|
end
|
232
242
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chupa-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.
|
4
|
+
version: 1.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kouhei Sutou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-03-
|
11
|
+
date: 2019-03-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: archive-zip
|
@@ -189,6 +189,7 @@ files:
|
|
189
189
|
- lib/chupa-text/screenshot.rb
|
190
190
|
- lib/chupa-text/size-parser.rb
|
191
191
|
- lib/chupa-text/text-data.rb
|
192
|
+
- lib/chupa-text/unzippable.rb
|
192
193
|
- lib/chupa-text/utf8-converter.rb
|
193
194
|
- lib/chupa-text/version.rb
|
194
195
|
- lib/chupa-text/virtual-content.rb
|
@@ -210,6 +211,7 @@ files:
|
|
210
211
|
- test/fixture/command/chupa-text/no-decomposer.conf
|
211
212
|
- test/fixture/command/chupa-text/numbers.csv
|
212
213
|
- test/fixture/docx/attributes.docx
|
214
|
+
- test/fixture/docx/empty.docx
|
213
215
|
- test/fixture/docx/multi-pages.docx
|
214
216
|
- test/fixture/docx/one-page.docx
|
215
217
|
- test/fixture/docx/special-characters.docx
|
@@ -218,24 +220,29 @@ files:
|
|
218
220
|
- test/fixture/gzip/hello.tgz
|
219
221
|
- test/fixture/gzip/hello.txt.gz
|
220
222
|
- test/fixture/odp/attributes.odp
|
223
|
+
- test/fixture/odp/empty.odp
|
221
224
|
- test/fixture/odp/multi-slides.odp
|
222
225
|
- test/fixture/odp/one-slide.odp
|
223
226
|
- test/fixture/ods/attributes.ods
|
224
227
|
- test/fixture/ods/covered-table-cell.ods
|
228
|
+
- test/fixture/ods/empty.ods
|
225
229
|
- test/fixture/ods/multi-sheets.ods
|
226
230
|
- test/fixture/ods/one-sheet.ods
|
227
231
|
- test/fixture/ods/shapes.ods
|
228
232
|
- test/fixture/odt/attributes.odt
|
233
|
+
- test/fixture/odt/empty.odt
|
229
234
|
- test/fixture/odt/multi-pages.odt
|
230
235
|
- test/fixture/odt/one-page.odt
|
231
236
|
- test/fixture/odt/special-characters.odt
|
232
237
|
- test/fixture/pptx/attributes.pptx
|
238
|
+
- test/fixture/pptx/empty.pptx
|
233
239
|
- test/fixture/pptx/multi-slides.pptx
|
234
240
|
- test/fixture/pptx/one-slide.pptx
|
235
241
|
- test/fixture/tar/directory.tar
|
236
242
|
- test/fixture/tar/top-level.tar
|
237
243
|
- test/fixture/tar/utf-8.tar
|
238
244
|
- test/fixture/xlsx/attributes.xlsx
|
245
|
+
- test/fixture/xlsx/empty.xlsx
|
239
246
|
- test/fixture/xlsx/multi-sheets.xlsx
|
240
247
|
- test/fixture/xlsx/not-shared-cell.xlsx
|
241
248
|
- test/fixture/xlsx/one-sheet.xlsx
|