chupa-text 1.1.7 → 1.1.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38)
  1. checksums.yaml +4 -4
  2. data/doc/text/news.md +20 -0
  3. data/lib/chupa-text.rb +1 -0
  4. data/lib/chupa-text/command/chupa-text.rb +8 -2
  5. data/lib/chupa-text/data.rb +14 -4
  6. data/lib/chupa-text/decomposers/gzip.rb +23 -2
  7. data/lib/chupa-text/decomposers/office-open-xml-document.rb +4 -0
  8. data/lib/chupa-text/decomposers/office-open-xml-presentation.rb +4 -0
  9. data/lib/chupa-text/decomposers/office-open-xml-workbook.rb +4 -0
  10. data/lib/chupa-text/decomposers/office-open-xml.rb +23 -24
  11. data/lib/chupa-text/decomposers/opendocument-presentation.rb +4 -0
  12. data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb +4 -0
  13. data/lib/chupa-text/decomposers/opendocument-text.rb +4 -0
  14. data/lib/chupa-text/decomposers/opendocument.rb +20 -17
  15. data/lib/chupa-text/decomposers/xml.rb +36 -9
  16. data/lib/chupa-text/decomposers/zip.rb +2 -20
  17. data/lib/chupa-text/extractor.rb +33 -33
  18. data/lib/chupa-text/input-data.rb +4 -0
  19. data/lib/chupa-text/sax-parser.rb +24 -2
  20. data/lib/chupa-text/unzippable.rb +39 -0
  21. data/lib/chupa-text/utf8-converter.rb +42 -24
  22. data/lib/chupa-text/version.rb +1 -1
  23. data/test/decomposers/test-gzip.rb +19 -2
  24. data/test/decomposers/test-office-open-xml-document.rb +18 -0
  25. data/test/decomposers/test-office-open-xml-presentation.rb +18 -0
  26. data/test/decomposers/test-office-open-xml-workbook.rb +18 -0
  27. data/test/decomposers/test-opendocument-presentation.rb +18 -0
  28. data/test/decomposers/test-opendocument-spreadsheet.rb +18 -0
  29. data/test/decomposers/test-opendocument-text.rb +18 -0
  30. data/test/decomposers/test-xml.rb +41 -2
  31. data/test/fixture/docx/empty.docx +0 -0
  32. data/test/fixture/odp/empty.odp +0 -0
  33. data/test/fixture/ods/empty.ods +0 -0
  34. data/test/fixture/odt/empty.odt +0 -0
  35. data/test/fixture/pptx/empty.pptx +0 -0
  36. data/test/fixture/xlsx/empty.xlsx +0 -0
  37. data/test/test-extractor.rb +10 -0
  38. metadata +9 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a8e70028837d0c6abebb4d01432f3467e8c5dd42129b9dffa3dd16a939594894
4
- data.tar.gz: 4a445af645995552c82e8339f1fd2d5f71dd8a47be669ee674ede6e635585c4f
3
+ metadata.gz: 99f896b313c052ecad3ea86ac6009c161cb2b3216b828aaec67da9dc886c5fbe
4
+ data.tar.gz: 986e35e1c19faa63652d0d8f43c9bd73e527e1d7af2593eea4525ee473414aa1
5
5
  SHA512:
6
- metadata.gz: 1052a2c7148c1bc184633b1ed365f422584bcdb4bc1952534e24f2ebf1445a0f286cdf4d47241a7aeefcfce80e4bdfb7c5bf6e9e74fba30a5a1f8d333fde62b3
7
- data.tar.gz: c2b355f73f09004214037069ffbdeafddd4e786c2e2498a905c9b651947f7508d1fc268052404e37297d2e816b7e115fa35ef959e98f22369f1e355220138b91
6
+ metadata.gz: 962932aed44748ca3c4809434d3e943ab5a113a6f57f058d29f7c60e6d1090f07d272d85221a5f62e9459f8a84f05725c7d5d33b15564d224dadb20215c155ae
7
+ data.tar.gz: 65face75476f34016b17b64cd0d843e305b87052192d944f2e60f5f06ea91b218e6d4fce1271eacf8f4de7152113fb87de86665b12b9fc479e80789dd0e428dd
data/doc/text/news.md CHANGED
@@ -1,5 +1,25 @@
1
1
  # News
2
2
 
3
+ ## 1.1.8: 2019-03-03
4
+
5
+ ### Improvements
6
+
7
+ * `gzip`: Added error checks.
8
+
9
+ * `xml`:
10
+
11
+ * Added error checks.
12
+
13
+ * Added support for Nokogiri as an alternative backend.
14
+
15
+ * Reduced memory usage.
16
+
17
+ * Added support for body size limitation.
18
+
19
+ * `opendocument`: Added error checks.
20
+
21
+ * `office-open-xml`: Added error checks.
22
+
3
23
  ## 1.1.7: 2019-03-01
4
24
 
5
25
  ### Improvements
data/lib/chupa-text.rb CHANGED
@@ -23,6 +23,7 @@ require "chupa-text/default-logger"
23
23
  require "chupa-text/logger"
24
24
 
25
25
  require "chupa-text/loggable"
26
+ require "chupa-text/unzippable"
26
27
 
27
28
  require "chupa-text/configuration"
28
29
  require "chupa-text/configuration-loader"
data/lib/chupa-text/command/chupa-text.rb CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -49,6 +49,7 @@ module ChupaText
49
49
  @mime_formatter_options = {}
50
50
  @need_screenshot = true
51
51
  @expected_screenshot_size = [200, 200]
52
+ @max_body_size = nil
52
53
  end
53
54
 
54
55
  def run(*arguments)
@@ -143,6 +144,11 @@ module ChupaText
143
144
  "(default: #{@expected_screenshot_size.join("x")})") do |size|
144
145
  @expected_screenshot_size = size
145
146
  end
147
+ parser.on("--max-body-size=BYTE", Integer,
148
+ "The max byte of extracted body.",
149
+ "(default: no limit)") do |size|
150
+ @max_body_size = size
151
+ end
146
152
 
147
153
  parser.separator("")
148
154
  parser.separator("Log related options:")
@@ -190,7 +196,7 @@ module ChupaText
190
196
  end
191
197
 
192
198
  def create_extractor
193
- extractor = Extractor.new
199
+ extractor = Extractor.new(max_body_size: @max_body_size)
194
200
  extractor.apply_configuration(@configuration)
195
201
  extractor
196
202
  end
data/lib/chupa-text/data.rb CHANGED
@@ -141,7 +141,9 @@ module ChupaText
141
141
  end
142
142
 
143
143
  def peek_body(size)
144
- body[0, size]
144
+ _body = body
145
+ return nil if _body.nil?
146
+ _body[0, size]
145
147
  end
146
148
 
147
149
  def [](name)
@@ -196,12 +198,20 @@ module ChupaText
196
198
  @need_screenshot
197
199
  end
198
200
 
199
- def to_utf8_body_data
200
- b = body
201
+ def to_utf8_body_data(max_body_size: nil)
202
+ b = nil
203
+ if max_body_size
204
+ open do |input|
205
+ b = input.read(max_body_size)
206
+ end
207
+ else
208
+ b = body
209
+ end
201
210
  return self if b.nil?
211
+
202
212
  converter = UTF8Converter.new(b)
203
213
  utf8_body = converter.convert
204
- if b.equal?(utf8_body)
214
+ if max_body_size.nil? and b.equal?(utf8_body)
205
215
  self
206
216
  else
207
217
  TextData.new(utf8_body, source_data: self)
data/lib/chupa-text/decomposers/gzip.rb CHANGED
@@ -19,6 +19,8 @@ require "zlib"
19
19
  module ChupaText
20
20
  module Decomposers
21
21
  class Gzip < Decomposer
22
+ include Loggable
23
+
22
24
  registry.register("gzip", self)
23
25
 
24
26
  TARGET_EXTENSIONS = ["gz", "tgz"]
@@ -33,8 +35,7 @@ module ChupaText
33
35
  end
34
36
 
35
37
  def decompose(data)
36
- data.open do |input|
37
- reader = Zlib::GzipReader.new(input)
38
+ open_reader(data) do |reader|
38
39
  uri = nil
39
40
  case data.extension
40
41
  when "gz"
@@ -46,6 +47,26 @@ module ChupaText
46
47
  yield(extracted)
47
48
  end
48
49
  end
50
+
51
+ private
52
+ def open_reader(data)
53
+ data.open do |input|
54
+ begin
55
+ yield(Zlib::GzipReader.new(input))
56
+ rescue Zlib::Error => zlib_error
57
+ error do
58
+ message = "#{log_tag} Failed to uncompress: "
59
+ message << "#{zlib_error.class}: #{zlib_error.message}\n"
60
+ message << zlib_error.backtrace.join("\n")
61
+ message
62
+ end
63
+ end
64
+ end
65
+ end
66
+
67
+ def log_tag
68
+ "[decomposer][gzip]"
69
+ end
49
70
  end
50
71
  end
51
72
  end
data/lib/chupa-text/decomposers/office-open-xml-document.rb CHANGED
@@ -58,6 +58,10 @@ module ChupaText
58
58
  end
59
59
  yield(text_data)
60
60
  end
61
+
62
+ def log_tag
63
+ "#{super}[document]"
64
+ end
61
65
  end
62
66
  end
63
67
  end
data/lib/chupa-text/decomposers/office-open-xml-presentation.rb CHANGED
@@ -76,6 +76,10 @@ module ChupaText
76
76
  yield(text_data)
77
77
  end
78
78
  end
79
+
80
+ def log_tag
81
+ "#{super}[presentation]"
82
+ end
79
83
  end
80
84
  end
81
85
  end
data/lib/chupa-text/decomposers/office-open-xml-workbook.rb CHANGED
@@ -93,6 +93,10 @@ module ChupaText
93
93
  end
94
94
  end
95
95
 
96
+ def log_tag
97
+ "#{super}[workbook]"
98
+ end
99
+
96
100
  class WorkbookListener < SAXListener
97
101
  URI = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
98
102
 
data/lib/chupa-text/decomposers/office-open-xml.rb CHANGED
@@ -14,13 +14,14 @@
14
14
  # License along with this library; if not, write to the Free Software
15
15
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
16
 
17
- require "archive/zip"
18
-
19
17
  require "chupa-text/sax-parser"
20
18
 
21
19
  module ChupaText
22
20
  module Decomposers
23
21
  class OfficeOpenXML < Decomposer
22
+ include Loggable
23
+ include Unzippable
24
+
24
25
  def target?(data)
25
26
  @extensions.include?(data.extension) or
26
27
  @mime_types.include?(data.mime_type)
@@ -35,29 +36,27 @@ module ChupaText
35
36
  end
36
37
 
37
38
  def decompose(data, &block)
38
- context = {
39
- data: data,
40
- attributes: {},
41
- }
42
- start_decompose(context)
43
- data.open do |input|
44
- Archive::Zip.open(input) do |zip|
45
- zip.each do |entry|
46
- next unless entry.file?
47
- case entry.zip_path
48
- when "docProps/app.xml"
49
- listener = AttributesListener.new(context[:attributes])
50
- parse(entry.file_data, listener)
51
- when "docProps/core.xml"
52
- listener = AttributesListener.new(context[:attributes])
53
- parse(entry.file_data, listener)
54
- else
55
- process_entry(entry, context)
56
- end
39
+ unzip(data) do |zip|
40
+ context = {
41
+ data: data,
42
+ attributes: {},
43
+ }
44
+ start_decompose(context)
45
+ zip.each do |entry|
46
+ next unless entry.file?
47
+ case entry.zip_path
48
+ when "docProps/app.xml"
49
+ listener = AttributesListener.new(context[:attributes])
50
+ parse(entry.file_data, listener)
51
+ when "docProps/core.xml"
52
+ listener = AttributesListener.new(context[:attributes])
53
+ parse(entry.file_data, listener)
54
+ else
55
+ process_entry(entry, context)
57
56
  end
58
57
  end
58
+ finish_decompose(context, &block)
59
59
  end
60
- finish_decompose(context, &block)
61
60
  end
62
61
 
63
62
  private
@@ -71,8 +70,8 @@ module ChupaText
71
70
  parse(entry.file_data, listener)
72
71
  end
73
72
 
74
- def accumulate_text(context)
75
- context[:text]
73
+ def log_tag
74
+ "[decomposer][office-open-xml]"
76
75
  end
77
76
 
78
77
  class TextListener < SAXListener
data/lib/chupa-text/decomposers/opendocument-presentation.rb CHANGED
@@ -49,6 +49,10 @@ module ChupaText
49
49
  end
50
50
  end
51
51
 
52
+ def log_tag
53
+ "#{super}[presentation]"
54
+ end
55
+
52
56
  class SlidesListener < SAXListener
53
57
  TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
54
58
  DRAW_URI = "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0"
data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb CHANGED
@@ -51,6 +51,10 @@ module ChupaText
51
51
  end
52
52
  end
53
53
 
54
+ def log_tag
55
+ "#{super}[spreadsheet]"
56
+ end
57
+
54
58
  class SheetsListener < SAXListener
55
59
  TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
56
60
  TABLE_URI = "urn:oasis:names:tc:opendocument:xmlns:table:1.0"
data/lib/chupa-text/decomposers/opendocument-text.rb CHANGED
@@ -43,6 +43,10 @@ module ChupaText
43
43
  yield(text_data)
44
44
  end
45
45
 
46
+ def log_tag
47
+ "#{super}[text]"
48
+ end
49
+
46
50
  class TextListener < SAXListener
47
51
  TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
48
52
  def initialize(output)
data/lib/chupa-text/decomposers/opendocument.rb CHANGED
@@ -14,13 +14,14 @@
14
14
  # License along with this library; if not, write to the Free Software
15
15
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
16
 
17
- require "archive/zip"
18
-
19
17
  require "chupa-text/sax-parser"
20
18
 
21
19
  module ChupaText
22
20
  module Decomposers
23
21
  class OpenDocument < Decomposer
22
+ include Loggable
23
+ include Unzippable
24
+
24
25
  def target?(data)
25
26
  data.extension == @extension or
26
27
  data.mime_type == @mime_type
@@ -35,24 +36,22 @@ module ChupaText
35
36
  end
36
37
 
37
38
  def decompose(data, &block)
38
- context = {
39
- data: data,
40
- attributes: {},
41
- }
42
- data.open do |input|
43
- Archive::Zip.open(input) do |zip|
44
- zip.each do |entry|
45
- next unless entry.file?
46
- case entry.zip_path
47
- when "content.xml"
48
- process_content(entry, context, &block)
49
- when "meta.xml"
50
- process_meta(entry, context, &block)
51
- end
39
+ unzip(data) do |zip|
40
+ context = {
41
+ data: data,
42
+ attributes: {},
43
+ }
44
+ zip.each do |entry|
45
+ next unless entry.file?
46
+ case entry.zip_path
47
+ when "content.xml"
48
+ process_content(entry, context, &block)
49
+ when "meta.xml"
50
+ process_meta(entry, context, &block)
52
51
  end
53
52
  end
53
+ finish_decompose(context, &block)
54
54
  end
55
- finish_decompose(context, &block)
56
55
  end
57
56
 
58
57
  private
@@ -66,6 +65,10 @@ module ChupaText
66
65
  parse(entry.file_data, listener)
67
66
  end
68
67
 
68
+ def log_tag
69
+ "[decomposer][opendocument]"
70
+ end
71
+
69
72
  class AttributesListener < SAXListener
70
73
  META_URI = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
71
74
  DUBLIN_CORE_URI = "http://purl.org/dc/elements/1.1/"
data/lib/chupa-text/decomposers/xml.rb CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -14,12 +14,13 @@
14
14
  # License along with this library; if not, write to the Free Software
15
15
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
16
 
17
- require "rexml/document"
18
- require "rexml/streamlistener"
17
+ require "chupa-text/sax-parser"
19
18
 
20
19
  module ChupaText
21
20
  module Decomposers
22
21
  class XML < Decomposer
22
+ include Loggable
23
+
23
24
  registry.register("xml", self)
24
25
 
25
26
  def target?(data)
@@ -31,22 +32,48 @@ module ChupaText
31
32
  text = ""
32
33
  listener = Listener.new(text)
33
34
  data.open do |input|
34
- parser = REXML::Parsers::StreamParser.new(input, listener)
35
- parser.parse
35
+ begin
36
+ parser = SAXParser.new(input, listener)
37
+ parser.parse
38
+ rescue SAXParser::ParseError => xml_error
39
+ error do
40
+ message = "#{log_tag} Failed to parse XML: "
41
+ message << "#{xml_error.class}: #{xml_error.message}\n"
42
+ message << xml_error.backtrace.join("\n")
43
+ message
44
+ end
45
+ return
46
+ end
36
47
  end
37
48
  text_data = TextData.new(text, :source_data => data)
38
49
  yield(text_data)
39
50
  end
40
51
 
41
- class Listener
42
- include REXML::StreamListener
52
+ private
53
+ def log_tag
54
+ "[decomposer][xml]"
55
+ end
43
56
 
57
+ class Listener < SAXListener
44
58
  def initialize(output)
45
59
  @output = output
60
+ @level = 0
61
+ end
62
+
63
+ def start_element(*args)
64
+ @level += 1
65
+ end
66
+
67
+ def end_element(*args)
68
+ @level -= 1
69
+ end
70
+
71
+ def characters(text)
72
+ @output << text if @level > 0
46
73
  end
47
74
 
48
- def text(text)
49
- @output << text
75
+ def cdata(content)
76
+ @output << content if @level > 0
50
77
  end
51
78
  end
52
79
  end
data/lib/chupa-text/decomposers/zip.rb CHANGED
@@ -14,14 +14,13 @@
14
14
  # License along with this library; if not, write to the Free Software
15
15
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
16
 
17
- require "archive/zip"
18
-
19
17
  require "chupa-text/path-converter"
20
18
 
21
19
  module ChupaText
22
20
  module Decomposers
23
21
  class Zip < Decomposer
24
22
  include Loggable
23
+ include Unzippable
25
24
 
26
25
  registry.register("zip", self)
27
26
 
@@ -33,7 +32,7 @@ module ChupaText
33
32
  end
34
33
 
35
34
  def decompose(data)
36
- open_zip(data) do |zip|
35
+ unzip(data) do |zip|
37
36
  zip.each do |entry|
38
37
  next unless entry.file?
39
38
 
@@ -68,23 +67,6 @@ module ChupaText
68
67
  end
69
68
 
70
69
  private
71
- def open_zip(data)
72
- begin
73
- data.open do |input|
74
- Archive::Zip.open(input) do |zip|
75
- yield(zip)
76
- end
77
- end
78
- rescue Archive::Zip::Error => zip_error
79
- error do
80
- message = "#{log_tag} Failed to process zip: "
81
- message << "#{zip_error.class}: #{zip_error.message}\n"
82
- message << zip_error.backtrace.join("\n")
83
- message
84
- end
85
- end
86
- end
87
-
88
70
  def log_tag
89
71
  "[decomposer][zip]"
90
72
  end
data/lib/chupa-text/extractor.rb CHANGED
@@ -21,8 +21,9 @@ module ChupaText
21
21
  class Extractor
22
22
  include Loggable
23
23
 
24
- def initialize
24
+ def initialize(max_body_size: nil)
25
25
  @decomposers = []
26
+ @max_body_size = max_body_size
26
27
  end
27
28
 
28
29
  # Sets the extractor up by the configuration. It adds decomposers
@@ -57,38 +58,8 @@ module ChupaText
57
58
  # You can get text data by `text_data.body`.
58
59
  #
59
60
  # @return [void]
60
- def extract(input)
61
- targets = [ensure_data(input)]
62
- until targets.empty?
63
- target = targets.shift
64
- debug do
65
- "#{log_tag}[extract][target] <#{target.uri}>:<#{target.mime_type}>"
66
- end
67
- decomposer = find_decomposer(target)
68
- if decomposer.nil?
69
- if target.text_plain?
70
- debug {"#{log_tag}[extract][text-plain]"}
71
- yield(target.to_utf8_body_data)
72
- next
73
- else
74
- debug {"#{log_tag}[extract][decomposer] not found"}
75
- if target.text?
76
- yield(target.to_utf8_body_data)
77
- end
78
- next
79
- end
80
- end
81
- debug {"#{log_tag}[extract][decomposer] #{decomposer.class}"}
82
- decomposer.decompose(target) do |decomposed|
83
- debug do
84
- "#{log_tag}[extract][decomposed] " +
85
- "#{decomposer.class}: " +
86
- "<#{target.uri}>: " +
87
- "<#{target.mime_type}> -> <#{decomposed.mime_type}>"
88
- end
89
- targets.push(decomposed)
90
- end
91
- end
61
+ def extract(input, &block)
62
+ extract_recursive(ensure_data(input), &block)
92
63
  end
93
64
 
94
65
  private
@@ -112,6 +83,35 @@ module ChupaText
112
83
  candidate[1]
113
84
  end
114
85
 
86
+ def extract_recursive(target, &block)
87
+ debug do
88
+ "#{log_tag}[extract][target] <#{target.uri}>:<#{target.mime_type}>"
89
+ end
90
+ decomposer = find_decomposer(target)
91
+ if decomposer.nil?
92
+ if target.text_plain?
93
+ debug {"#{log_tag}[extract][text-plain]"}
94
+ yield(target.to_utf8_body_data(max_body_size: @max_body_size))
95
+ else
96
+ debug {"#{log_tag}[extract][decomposer] not found"}
97
+ if target.text?
98
+ yield(target.to_utf8_body_data(max_body_size: @max_body_size))
99
+ end
100
+ end
101
+ else
102
+ debug {"#{log_tag}[extract][decomposer] #{decomposer.class}"}
103
+ decomposer.decompose(target) do |decomposed|
104
+ debug do
105
+ "#{log_tag}[extract][decomposed] " +
106
+ "#{decomposer.class}: " +
107
+ "<#{target.uri}>: " +
108
+ "<#{target.mime_type}> -> <#{decomposed.mime_type}>"
109
+ end
110
+ extract_recursive(decomposed, &block)
111
+ end
112
+ end
113
+ end
114
+
115
115
  def log_tag
116
116
  "[extractor]"
117
117
  end
data/lib/chupa-text/input-data.rb CHANGED
@@ -36,6 +36,10 @@ module ChupaText
36
36
  @content.body
37
37
  end
38
38
 
39
+ def peek_body(size)
40
+ @content.peek_body(size)
41
+ end
42
+
39
43
  def size
40
44
  @content.size
41
45
  end
data/lib/chupa-text/sax-parser.rb CHANGED
@@ -25,6 +25,9 @@ end
25
25
 
26
26
  module ChupaText
27
27
  class SAXParser
28
+ class ParseError < Error
29
+ end
30
+
28
31
  class << self
29
32
  def backend
30
33
  case ENV["CHUPA_TEXT_SAX_PARSER_BACKEND"]
@@ -94,6 +97,10 @@ module ChupaText
94
97
  @listener.cdata(content)
95
98
  end
96
99
 
100
+ def error(detail)
101
+ raise ParseError, detail
102
+ end
103
+
97
104
  private
98
105
  def build_qname(prefix, local_name)
99
106
  if prefix
@@ -105,10 +112,25 @@ module ChupaText
105
112
  end
106
113
  else
107
114
  def parse
108
- source = REXML::Source.new(@input.read)
115
+ source = @input
116
+ if source.is_a?(Archive::Zip::Codec::Deflate::Decompress)
117
+ source = source.read
118
+ end
109
119
  parser = REXML::Parsers::SAX2Parser.new(source)
110
120
  parser.listen(Listener.new(@listener))
111
- parser.parse
121
+ begin
122
+ parser.parse
123
+ rescue REXML::ParseException => error
124
+ message = "#{error.class}: #{error.message}"
125
+ raise ParseError, message
126
+ rescue ArgumentError => error
127
+ if error.message.start_with?("invalid byte sequence")
128
+ message = "#{error.class}: #{error.message}"
129
+ raise ParseError, message
130
+ else
131
+ raise
132
+ end
133
+ end
112
134
  end
113
135
 
114
136
  class Listener
data/lib/chupa-text/unzippable.rb ADDED
@@ -0,0 +1,39 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "archive/zip"
18
+
19
+ module ChupaText
20
+ module Unzippable
21
+ private
22
+ def unzip(data)
23
+ data.open do |input|
24
+ begin
25
+ Archive::Zip.open(input) do |zip|
26
+ yield(zip)
27
+ end
28
+ rescue Archive::Zip::Error => zip_error
29
+ error do
30
+ message = "#{log_tag} Failed to process zip: "
31
+ message << "#{zip_error.class}: #{zip_error.message}\n"
32
+ message << zip_error.backtrace.join("\n")
33
+ message
34
+ end
35
+ end
36
+ end
37
+ end
38
+ end
39
+ end
data/lib/chupa-text/utf8-converter.rb CHANGED
@@ -16,8 +16,9 @@
16
16
 
17
17
  module ChupaText
18
18
  class UTF8Converter
19
- def initialize(string)
19
+ def initialize(string, max_size: nil)
20
20
  @string = string
21
+ @max_size = max_size
21
22
  end
22
23
 
23
24
  def convert
@@ -26,44 +27,51 @@ module ChupaText
26
27
  when Encoding::UTF_8
27
28
  bom_size, bom_encoding = detect_bom
28
29
  if bom_size
29
- return @string.byteslice(bom_size,
30
- @string.bytesize - bom_size)
30
+ utf8_string = @string.byteslice(bom_size,
31
+ @string.bytesize - bom_size)
31
32
  else
32
- return @string
33
+ utf8_string = @string
33
34
  end
35
+ return truncate(utf8_string)
34
36
  when Encoding::ASCII_8BIT
35
- return @string if @string.ascii_only?
37
+ return truncate(@string) if @string.ascii_only?
36
38
  else
37
- return @string.encode(Encoding::UTF_8,
38
- invalid: :replace,
39
- undef: :replace,
40
- replace: "")
39
+ utf8_string = @string.encode(Encoding::UTF_8,
40
+ invalid: :replace,
41
+ undef: :replace,
42
+ replace: "")
43
+ return truncate(utf8_string)
41
44
  end
42
45
 
43
46
  bom_size, bom_encoding = detect_bom
44
47
  if bom_encoding
45
48
  string_without_bom = @string.byteslice(bom_size,
46
49
  @string.bytesize - bom_size)
47
- return string_without_bom.encode(Encoding::UTF_8,
48
- bom_encoding,
49
- invalid: :replace,
50
- undef: :replace,
51
- replace: "")
50
+ utf8_string = string_without_bom.encode(Encoding::UTF_8,
51
+ bom_encoding,
52
+ invalid: :replace,
53
+ undef: :replace,
54
+ replace: "")
55
+ return truncate(utf8_string)
52
56
  end
53
57
 
54
58
  guessed_encoding = guess_encoding
55
59
  if guessed_encoding
56
- @string.encode(Encoding::UTF_8,
57
- guessed_encoding,
58
- invalid: :replace,
59
- undef: :replace,
60
- replace: "")
60
+ truncate(@string.encode(Encoding::UTF_8,
61
+ guessed_encoding,
62
+ invalid: :replace,
63
+ undef: :replace,
64
+ replace: ""))
61
65
  else
62
- utf8_body = @string.dup
63
- utf8_body.force_encoding(Encoding::UTF_8)
64
- utf8_body.scrub!("")
65
- utf8_body.gsub!(/\p{Control}+/, "")
66
- utf8_body
66
+ if @max_size
67
+ utf8_string = @string.byteslice(0, @max_size)
68
+ else
69
+ utf8_string = @string.dup
70
+ end
71
+ utf8_string.force_encoding(Encoding::UTF_8)
72
+ utf8_string.scrub!("")
73
+ utf8_string.gsub!(/\p{Control}+/, "")
74
+ utf8_string
67
75
  end
68
76
  end
69
77
 
@@ -113,5 +121,15 @@ module ChupaText
113
121
  @string.force_encoding(original_encoding)
114
122
  end
115
123
  end
124
+
125
+ def truncate(string)
126
+ if @max_size and string.bytesize > @max_size
127
+ truncated = string.byteslice(0, @max_size)
128
+ truncated.scrub!("")
129
+ truncated
130
+ else
131
+ string
132
+ end
133
+ end
116
134
  end
117
135
  end
data/lib/chupa-text/version.rb CHANGED
@@ -15,5 +15,5 @@
15
15
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
16
 
17
17
  module ChupaText
18
- VERSION = "1.1.7"
18
+ VERSION = "1.1.8"
19
19
  end
data/test/decomposers/test-gzip.rb CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -83,7 +83,6 @@ class TestDecomposersGzip < Test::Unit::TestCase
83
83
  end
84
84
  end
85
85
 
86
-
87
86
  sub_test_case("tgz") do
88
87
  def setup
89
88
  super
@@ -109,5 +108,23 @@ class TestDecomposersGzip < Test::Unit::TestCase
109
108
  decompose(@data).collect(&:source))
110
109
  end
111
110
  end
111
+
112
+ def test_invalid
113
+ data = ChupaText::Data.new
114
+ data.body = "Hello"
115
+ data.size = data.body.bytesize
116
+ data.mime_type = "application/gzip"
117
+ messages = capture_log do
118
+ assert_equal([], decompose(data).collect(&:body))
119
+ end
120
+ assert_equal([
121
+ [
122
+ :error,
123
+ "[decomposer][gzip] Failed to uncompress: " +
124
+ "Zlib::GzipFile::Error: not in gzip format",
125
+ ],
126
+ ],
127
+ messages)
128
+ end
112
129
  end
113
130
  end
data/test/decomposers/test-office-open-xml-document.rb CHANGED
@@ -140,5 +140,23 @@ Single quote: ''
140
140
  BODY
141
141
  end
142
142
  end
143
+
144
+ sub_test_case("invalid") do
145
+ def test_empty
146
+ messages = capture_log do
147
+ assert_equal([], decompose(fixture_path("docx", "empty.docx")))
148
+ end
149
+ assert_equal([
150
+ [
151
+ :error,
152
+ "[decomposer][office-open-xml][document] " +
153
+ "Failed to process zip: " +
154
+ "Archive::Zip::UnzipError: " +
155
+ "unable to locate end-of-central-directory record",
156
+ ],
157
+ ],
158
+ messages)
159
+ end
160
+ end
143
161
  end
144
162
  end
data/test/decomposers/test-office-open-xml-presentation.rb CHANGED
@@ -129,5 +129,23 @@ class TestDecomposersOfficeOpenXMLPresentation < Test::Unit::TestCase
129
129
  decompose(fixture_path("pptx", "multi-slides.pptx")))
130
130
  end
131
131
  end
132
+
133
+ sub_test_case("invalid") do
134
+ def test_empty
135
+ messages = capture_log do
136
+ assert_equal([], decompose(fixture_path("pptx", "empty.pptx")))
137
+ end
138
+ assert_equal([
139
+ [
140
+ :error,
141
+ "[decomposer][office-open-xml][presentation] " +
142
+ "Failed to process zip: " +
143
+ "Archive::Zip::UnzipError: " +
144
+ "unable to locate end-of-central-directory record",
145
+ ],
146
+ ],
147
+ messages)
148
+ end
149
+ end
132
150
  end
133
151
  end
data/test/decomposers/test-office-open-xml-workbook.rb CHANGED
@@ -152,5 +152,23 @@ class TestDecomposersOfficeOpenXMLWorkbook < Test::Unit::TestCase
152
152
  decompose(fixture_path("xlsx", "multi-sheets.xlsx")))
153
153
  end
154
154
  end
155
+
156
+ sub_test_case("invalid") do
157
+ def test_empty
158
+ messages = capture_log do
159
+ assert_equal([], decompose(fixture_path("xlsx", "empty.xlsx")))
160
+ end
161
+ assert_equal([
162
+ [
163
+ :error,
164
+ "[decomposer][office-open-xml][workbook] " +
165
+ "Failed to process zip: " +
166
+ "Archive::Zip::UnzipError: " +
167
+ "unable to locate end-of-central-directory record",
168
+ ],
169
+ ],
170
+ messages)
171
+ end
172
+ end
155
173
  end
156
174
  end
data/test/decomposers/test-opendocument-presentation.rb CHANGED
@@ -132,5 +132,23 @@ class TestDecomposersOpenDocumentPresentation < Test::Unit::TestCase
132
132
  decompose.collect {|data| [data["index"], data.body]})
133
133
  end
134
134
  end
135
+
136
+ sub_test_case("invalid") do
137
+ def test_empty
138
+ messages = capture_log do
139
+ assert_equal([], decompose(fixture_path("odp", "empty.odp")))
140
+ end
141
+ assert_equal([
142
+ [
143
+ :error,
144
+ "[decomposer][opendocument][presentation] " +
145
+ "Failed to process zip: " +
146
+ "Archive::Zip::UnzipError: " +
147
+ "unable to locate end-of-central-directory record",
148
+ ],
149
+ ],
150
+ messages)
151
+ end
152
+ end
135
153
  end
136
154
  end
data/test/decomposers/test-opendocument-spreadsheet.rb CHANGED
@@ -164,5 +164,23 @@ class TestDecomposersOpenDocumentSpreadsheet < Test::Unit::TestCase
164
164
  decompose(fixture_path("ods", "shapes.ods")))
165
165
  end
166
166
  end
167
+
168
+ sub_test_case("invalid") do
169
+ def test_empty
170
+ messages = capture_log do
171
+ assert_equal([], decompose(fixture_path("ods", "empty.ods")))
172
+ end
173
+ assert_equal([
174
+ [
175
+ :error,
176
+ "[decomposer][opendocument][spreadsheet] " +
177
+ "Failed to process zip: " +
178
+ "Archive::Zip::UnzipError: " +
179
+ "unable to locate end-of-central-directory record",
180
+ ],
181
+ ],
182
+ messages)
183
+ end
184
+ end
167
185
  end
168
186
  end
data/test/decomposers/test-opendocument-text.rb CHANGED
@@ -140,5 +140,23 @@ Single quote: ''
140
140
  BODY
141
141
  end
142
142
  end
143
+
144
+ sub_test_case("invalid") do
145
+ def test_empty
146
+ messages = capture_log do
147
+ assert_equal([], decompose(fixture_path("odt", "empty.odt")))
148
+ end
149
+ assert_equal([
150
+ [
151
+ :error,
152
+ "[decomposer][opendocument][text] " +
153
+ "Failed to process zip: " +
154
+ "Archive::Zip::UnzipError: " +
155
+ "unable to locate end-of-central-directory record",
156
+ ],
157
+ ],
158
+ messages)
159
+ end
160
+ end
143
161
  end
144
162
  end
data/test/decomposers/test-xml.rb CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -35,12 +35,40 @@ class TestDecomposersXML < Test::Unit::TestCase
35
35
  Hello
36
36
  &
37
37
  World
38
-
39
38
  TEXT
40
39
  assert_equal([text],
41
40
  decompose(xml).collect(&:body))
42
41
  end
43
42
 
43
+ def test_invalid_xml
44
+ messages = capture_log do
45
+ assert_equal([], decompose("<root x=/>"))
46
+ end
47
+ assert_equal([
48
+ [
49
+ :error,
50
+ "[decomposer][xml] Failed to parse XML: " +
51
+ "ChupaText::SAXParser::ParseError: ...",
52
+ ],
53
+ ],
54
+ messages)
55
+ end
56
+
57
+ def test_invalid_encoding
58
+ messages = capture_log do
59
+ assert_equal([],
60
+ decompose("\x00\x05\a\xA6"))
61
+ end
62
+ assert_equal([
63
+ [
64
+ :error,
65
+ "[decomposer][xml] Failed to parse XML: " +
66
+ "ChupaText::SAXParser::ParseError: ...",
67
+ ],
68
+ ],
69
+ messages)
70
+ end
71
+
44
72
  private
45
73
  def decompose(xml)
46
74
  data = ChupaText::Data.new
@@ -54,5 +82,16 @@ class TestDecomposersXML < Test::Unit::TestCase
54
82
  end
55
83
  decomposed
56
84
  end
85
+
86
+ def capture_log
87
+ messages = super
88
+ messages.collect do |level, message|
89
+ [
90
+ level,
91
+ message.gsub(/(ChupaText::SAXParser::ParseError:) .*/,
92
+ "\\1 ...")
93
+ ]
94
+ end
95
+ end
57
96
  end
58
97
  end
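data/test/fixture/docx/empty.docx ADDED
File without changes
data/test/fixture/odp/empty.odp ADDED
File without changes
data/test/fixture/ods/empty.ods ADDED
File without changes
data/test/fixture/odt/empty.odt ADDED
File without changes
data/test/fixture/pptx/empty.pptx ADDED
File without changes
data/test/fixture/xlsx/empty.xlsx ADDED
File without changes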
data/test/test-extractor.rb CHANGED
@@ -228,5 +228,15 @@ class TestExtractor < Test::Unit::TestCase
228
228
  assert_equal(["こんにちは"], extract(data))
229
229
  end
230
230
  end
231
+
232
+ sub_test_case("max body size") do
233
+ def test_last_invalid
234
+ @extractor = ChupaText::Extractor.new(max_body_size: 5)
235
+ data = ChupaText::Data.new
236
+ data.mime_type = "text/plain"
237
+ data.body = "こん"
238
+ assert_equal(["こ"], extract(data))
239
+ end
240
+ end
231
241
  end
232
242
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chupa-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.7
4
+ version: 1.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kouhei Sutou
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-03-01 00:00:00.000000000 Z
11
+ date: 2019-03-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: archive-zip
@@ -189,6 +189,7 @@ files:
189
189
  - lib/chupa-text/screenshot.rb
190
190
  - lib/chupa-text/size-parser.rb
191
191
  - lib/chupa-text/text-data.rb
192
+ - lib/chupa-text/unzippable.rb
192
193
  - lib/chupa-text/utf8-converter.rb
193
194
  - lib/chupa-text/version.rb
194
195
  - lib/chupa-text/virtual-content.rb
@@ -210,6 +211,7 @@ files:
210
211
  - test/fixture/command/chupa-text/no-decomposer.conf
211
212
  - test/fixture/command/chupa-text/numbers.csv
212
213
  - test/fixture/docx/attributes.docx
214
+ - test/fixture/docx/empty.docx
213
215
  - test/fixture/docx/multi-pages.docx
214
216
  - test/fixture/docx/one-page.docx
215
217
  - test/fixture/docx/special-characters.docx
@@ -218,24 +220,29 @@ files:
218
220
  - test/fixture/gzip/hello.tgz
219
221
  - test/fixture/gzip/hello.txt.gz
220
222
  - test/fixture/odp/attributes.odp
223
+ - test/fixture/odp/empty.odp
221
224
  - test/fixture/odp/multi-slides.odp
222
225
  - test/fixture/odp/one-slide.odp
223
226
  - test/fixture/ods/attributes.ods
224
227
  - test/fixture/ods/covered-table-cell.ods
228
+ - test/fixture/ods/empty.ods
225
229
  - test/fixture/ods/multi-sheets.ods
226
230
  - test/fixture/ods/one-sheet.ods
227
231
  - test/fixture/ods/shapes.ods
228
232
  - test/fixture/odt/attributes.odt
233
+ - test/fixture/odt/empty.odt
229
234
  - test/fixture/odt/multi-pages.odt
230
235
  - test/fixture/odt/one-page.odt
231
236
  - test/fixture/odt/special-characters.odt
232
237
  - test/fixture/pptx/attributes.pptx
238
+ - test/fixture/pptx/empty.pptx
233
239
  - test/fixture/pptx/multi-slides.pptx
234
240
  - test/fixture/pptx/one-slide.pptx
235
241
  - test/fixture/tar/directory.tar
236
242
  - test/fixture/tar/top-level.tar
237
243
  - test/fixture/tar/utf-8.tar
238
244
  - test/fixture/xlsx/attributes.xlsx
245
+ - test/fixture/xlsx/empty.xlsx
239
246
  - test/fixture/xlsx/multi-sheets.xlsx
240
247
  - test/fixture/xlsx/not-shared-cell.xlsx
241
248
  - test/fixture/xlsx/one-sheet.xlsx