chupa-text 1.1.5 → 1.1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/chupa-text.gemspec +2 -1
  3. data/doc/text/news.md +42 -0
  4. data/lib/chupa-text/data.rb +19 -2
  5. data/lib/chupa-text/decomposers/csv.rb +20 -4
  6. data/lib/chupa-text/decomposers/office-open-xml-document.rb +12 -0
  7. data/lib/chupa-text/decomposers/office-open-xml-presentation.rb +17 -3
  8. data/lib/chupa-text/decomposers/office-open-xml-workbook.rb +83 -10
  9. data/lib/chupa-text/decomposers/office-open-xml.rb +4 -8
  10. data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb +26 -2
  11. data/lib/chupa-text/decomposers/tar.rb +18 -12
  12. data/lib/chupa-text/decomposers/zip.rb +30 -4
  13. data/lib/chupa-text/extractor.rb +5 -3
  14. data/lib/chupa-text/path-converter.rb +70 -0
  15. data/lib/chupa-text/utf8-converter.rb +117 -0
  16. data/lib/chupa-text/version.rb +1 -1
  17. data/test/command/test-chupa-text.rb +4 -4
  18. data/test/decomposers/test-csv.rb +18 -3
  19. data/test/decomposers/test-office-open-xml-presentation.rb +46 -46
  20. data/test/decomposers/test-office-open-xml-workbook.rb +64 -46
  21. data/test/decomposers/{test-open-document-presentation.rb → test-opendocument-presentation.rb} +0 -0
  22. data/test/decomposers/{test-open-document-spreadsheet.rb → test-opendocument-spreadsheet.rb} +35 -19
  23. data/test/decomposers/{test-open-document-text.rb → test-opendocument-text.rb} +0 -0
  24. data/test/decomposers/test-tar.rb +18 -1
  25. data/test/decomposers/test-zip.rb +31 -1
  26. data/test/fixture/ods/covered-table-cell.ods +0 -0
  27. data/test/fixture/ods/shapes.ods +0 -0
  28. data/test/fixture/tar/utf-8.tar +0 -0
  29. data/test/fixture/xlsx/not-shared-cell.xlsx +0 -0
  30. data/test/fixture/zip/cp932.zip +0 -0
  31. data/test/fixture/zip/utf-8.zip +0 -0
  32. data/test/helper.rb +31 -1
  33. data/test/test-data.rb +7 -3
  34. data/test/test-extractor.rb +108 -1
  35. metadata +29 -7
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 65f2eb5be135f3db4a0cbb81bc4a268bcb752934da6d9aa32740842bb4cc674f
4
- data.tar.gz: 72f8487dc57b67293e001621f6b63ece4680cf1069d9f98e3daa89708571766b
3
+ metadata.gz: 8a248b3b82aeb4ce65bd7b498dc2aab9ad6e43e6d5ea70a304163362c3e723e7
4
+ data.tar.gz: 82f3f02b63235924cab8608f1fde8d4ecbe4cb0e1aa92f12bf9ecff1debab21d
5
5
  SHA512:
6
- metadata.gz: 32149617d9c921de856aa9fb3856a8a29fde5e9e1872a16241235499b1b8e014941928fb040ad60d52c15cbc4b69d84a9860bd432074964865e7f1e85d673a1e
7
- data.tar.gz: 85c2820468547c9b34fef27faa2db0fc9d8208a266949177b2e4f2269bfadcd8f5b534c0cdb4aff9e28360c1509dd6b2d06ef72c7d1e268f399e97595a133a2c
6
+ metadata.gz: f2c3acee194db640b3b3e01a913cfc00727ed7b8e0e4b353f5fbb2d1723f689846164d44f2ffcf1874b2948ec3f3b578dacd49d27934693ad71f24b9901373a2
7
+ data.tar.gz: 12bd3c5ed94f85bad888e56a62ef478a606e8ce5fe8f376f7c2418924ef91094b1a6c150c60fd80278a7541bb5e177b7d9cb4b7ec5b67a482b8ff74c67953ba2
data/chupa-text.gemspec CHANGED
@@ -50,7 +50,8 @@ Gem::Specification.new do |spec|
50
50
  spec.executables = Dir.glob("*")
51
51
  end
52
52
 
53
- spec.add_runtime_dependency("archive-zip")
53
+ spec.add_runtime_dependency("archive-zip", ">= 0.12.0")
54
+ spec.add_runtime_dependency("csv", ">= 3.0.4")
54
55
 
55
56
  spec.add_development_dependency("bundler")
56
57
  spec.add_development_dependency("nokogiri")
data/doc/text/news.md CHANGED
@@ -1,5 +1,47 @@
1
1
  # News
2
2
 
3
+ ## 1.1.6: 2019-03-01
4
+
5
+ ### Improvements
6
+
7
+ * `zip`:
8
+
9
+ * Added support for multibyte path.
10
+
11
+ * Added error check.
12
+
13
+ * `tar`:
14
+
15
+ * Added support for multibyte path.
16
+
17
+ * Reduced memory usage.
18
+
19
+ * Changed the extracted text encoding to UTF-8.
20
+
21
+ * Added support for BOM detection.
22
+
23
+ * Improved binary data detection.
24
+
25
+ * `office-open-xml-workbook`:
26
+
27
+ * Added support for cell values that are not shared strings.
28
+
29
+ * Changed to emit data per sheet.
30
+
31
+ * `office-open-xml-presentation`:
32
+
33
+ * Changed to emit data per slide.
34
+
35
+ * `csv`:
36
+
37
+ * Added error check.
38
+
39
+ * `opendocument-spreadsheet`:
40
+
41
+ * Added support for concatenated cells.
42
+
43
+ * Added support for shapes.
44
+
3
45
  ## 1.1.5: 2019-02-28
4
46
 
5
47
  ### Improvements
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -18,6 +18,8 @@ require "cgi/util"
18
18
  require "uri"
19
19
  require "open-uri"
20
20
 
21
+ require "chupa-text/utf8-converter"
22
+
21
23
  module ChupaText
22
24
  class Data
23
25
  # @return [URI, nil] The URI of the data if the data is for remote
@@ -190,6 +192,18 @@ module ChupaText
190
192
  @need_screenshot
191
193
  end
192
194
 
195
+ def to_utf8_body_data
196
+ b = body
197
+ return self if b.nil?
198
+ converter = UTF8Converter.new(b)
199
+ utf8_body = converter.convert
200
+ if b.equal?(utf8_body)
201
+ self
202
+ else
203
+ TextData.new(utf8_body, source_data: self)
204
+ end
205
+ end
206
+
193
207
  private
194
208
  def guess_mime_type
195
209
  guess_mime_type_from_uri or
@@ -203,7 +217,10 @@ module ChupaText
203
217
  def guess_mime_type_from_body
204
218
  mime_type = nil
205
219
  change_encoding(body, "UTF-8") do |utf8_body|
206
- mime_type = "text/plain" if utf8_body.valid_encoding?
220
+ return nil unless utf8_body.valid_encoding?
221
+ n_null_characters = utf8_body.count("\u0000")
222
+ return nil if n_null_characters > (utf8_body.bytesize * 0.01)
223
+ mime_type = "text/plain"
207
224
  end
208
225
  mime_type
209
226
  end
@@ -20,6 +20,8 @@ require "csv"
20
20
  module ChupaText
21
21
  module Decomposers
22
22
  class CSV < Decomposer
23
+ include Loggable
24
+
23
25
  registry.register("csv", self)
24
26
 
25
27
  def target?(data)
@@ -36,10 +38,20 @@ module ChupaText
36
38
  def decompose(data)
37
39
  text = ""
38
40
  data.open do |input|
39
- csv = ::CSV.new(input)
40
- csv.each do |row|
41
- text << row.join(" ")
42
- text << "\n"
41
+ begin
42
+ csv = ::CSV.new(input, liberal_parsing: true)
43
+ csv.each do |row|
44
+ text << row.join("\t")
45
+ text << "\n"
46
+ end
47
+ rescue ::CSV::MalformedCSVError => csv_error
48
+ error do
49
+ message = "#{log_tag} Failed to parse CSV: "
50
+ message << "#{csv_error.class}: #{csv_error.message}\n"
51
+ message << csv_error.backtrace.join("\n")
52
+ message
53
+ end
54
+ return
43
55
  end
44
56
  end
45
57
 
@@ -78,6 +90,10 @@ module ChupaText
78
90
  SVG
79
91
  Screenshot.new(mime_type, data)
80
92
  end
93
+
94
+ def log_tag
95
+ "[decomposer][csv]"
96
+ end
81
97
  end
82
98
  end
83
99
  end
@@ -40,12 +40,24 @@ module ChupaText
40
40
  end
41
41
 
42
42
  private
43
+ def start_decompose(context)
44
+ context[:text] = ""
45
+ end
46
+
43
47
  def process_entry(entry, context)
44
48
  case entry.zip_path
45
49
  when "word/document.xml"
46
50
  extract_text(entry, context[:text])
47
51
  end
48
52
  end
53
+
54
+ def finish_decompose(context, &block)
55
+ text_data = TextData.new(context[:text], source_data: context[:data])
56
+ context[:attributes].each do |name, value|
57
+ text_data[name] = value
58
+ end
59
+ yield(text_data)
60
+ end
49
61
  end
50
62
  end
51
63
  end
@@ -48,19 +48,33 @@ module ChupaText
48
48
  end
49
49
 
50
50
  private
51
+ def start_decompose(context)
52
+ context[:slides] = []
53
+ end
54
+
51
55
  def process_entry(entry, context)
52
56
  case entry.zip_path
53
57
  when /\Appt\/slides\/slide(\d+)\.xml/
54
58
  nth_slide = Integer($1, 10)
55
59
  slide_text = ""
56
60
  extract_text(entry, slide_text)
57
- context[:slides] ||= []
58
61
  context[:slides] << [nth_slide, slide_text]
59
62
  end
60
63
  end
61
64
 
62
- def accumulate_text(context)
63
- context[:slides].sort_by(&:first).collect(&:last).join("\n")
65
+ def finish_decompose(context, &block)
66
+ metadata = TextData.new("", source_data: context[:data])
67
+ context[:attributes].each do |name, value|
68
+ metadata[name] = value
69
+ end
70
+ yield(metadata)
71
+
72
+ slide_texts = context[:slides].sort_by(&:first).collect(&:last)
73
+ slide_texts.each_with_index do |slide_text, i|
74
+ text_data = TextData.new(slide_text, source_data: context[:data])
75
+ text_data["index"] = i
76
+ yield(text_data)
77
+ end
64
78
  end
65
79
  end
66
80
  end
@@ -40,35 +40,73 @@ module ChupaText
40
40
  end
41
41
 
42
42
  private
43
+ def start_decompose(context)
44
+ context[:shared_strings] = []
45
+ context[:sheet_names] = []
46
+ context[:sheets] = []
47
+ end
48
+
43
49
  def process_entry(entry, context)
44
50
  case entry.zip_path
45
51
  when "xl/sharedStrings.xml"
46
- context[:shared_strings] = []
47
52
  extract_text(entry, context[:shared_strings])
53
+ when "xl/workbook.xml"
54
+ listener = WorkbookListener.new(context[:sheet_names])
55
+ parse(entry.file_data, listener)
48
56
  when /\Axl\/worksheets\/sheet(\d+)\.xml\z/
49
57
  nth_sheet = Integer($1, 10)
50
58
  sheet = []
51
59
  listener = SheetListener.new(sheet)
52
60
  parse(entry.file_data, listener)
53
- context[:sheets] ||= []
54
61
  context[:sheets] << [nth_sheet, sheet]
55
62
  end
56
63
  end
57
64
 
58
- def accumulate_text(context)
65
+ def finish_decompose(context, &block)
66
+ metadata = TextData.new("", source_data: context[:data])
67
+ context[:attributes].each do |name, value|
68
+ metadata[name] = value
69
+ end
70
+ yield(metadata)
71
+
59
72
  shared_strings = context[:shared_strings]
60
73
  sheets = context[:sheets].sort_by(&:first).collect(&:last)
61
- sheet_texts = sheets.collect do |sheet|
74
+ sheet_names = context[:sheet_names]
75
+ sheets.each_with_index do |sheet, i|
62
76
  sheet_text = ""
63
77
  sheet.each do |row|
64
- row_texts = row.collect do |index|
65
- shared_strings[index]
78
+ row_texts = row.collect do |cell|
79
+ case cell
80
+ when Integer
81
+ shared_strings[cell]
82
+ else
83
+ cell
84
+ end
66
85
  end
67
86
  sheet_text << row_texts.join("\t") << "\n"
68
87
  end
69
- sheet_text
88
+ text_data = TextData.new(sheet_text, source_data: context[:data])
89
+ text_data["index"] = i
90
+ name = sheet_names[i]
91
+ text_data["name"] = name if name
92
+ yield(text_data)
93
+ end
94
+ end
95
+
96
+ class WorkbookListener < SAXListener
97
+ URI = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
98
+
99
+ def initialize(sheet_names)
100
+ @sheet_names = sheet_names
101
+ end
102
+
103
+ def start_element(uri, local_name, qname, attributes)
104
+ return unless uri == URI
105
+ case local_name
106
+ when "sheet"
107
+ @sheet_names << attributes["name"]
108
+ end
70
109
  end
71
- sheet_texts.join("\n")
72
110
  end
73
111
 
74
112
  class SheetListener < SAXListener
@@ -76,6 +114,7 @@ module ChupaText
76
114
 
77
115
  def initialize(sheet)
78
116
  @sheet = sheet
117
+ @cell_type = nil
79
118
  @in_v = false
80
119
  end
81
120
 
@@ -84,13 +123,22 @@ module ChupaText
84
123
  case local_name
85
124
  when "row"
86
125
  @sheet << []
126
+ when "c"
127
+ @cell_type = parse_cell_type(attributes["t"])
128
+ # when "is" # TODO
87
129
  when "v"
88
130
  @in_v = true
89
131
  end
90
132
  end
91
133
 
92
134
  def end_element(uri, local_name, qname)
93
- @in_v = false
135
+ return unless uri == URI
136
+ case local_name
137
+ when "c"
138
+ @cell_type = nil
139
+ when "v"
140
+ @in_v = false
141
+ end
94
142
  end
95
143
 
96
144
  def characters(text)
@@ -102,9 +150,34 @@ module ChupaText
102
150
  end
103
151
 
104
152
  private
153
+ # https://c-rex.net/projects/samples/ooxml/e1/Part4/OOXML_P4_DOCX_ST_CellType_topic_ID0E6NEFB.html
154
+ def parse_cell_type(type)
155
+ case type
156
+ when "b"
157
+ :boolean
158
+ when "e"
159
+ :error
160
+ when "inlineStr"
161
+ :inline_string
162
+ when "n"
163
+ :number
164
+ when "s"
165
+ :shared_string
166
+ when "str"
167
+ :string
168
+ else
169
+ nil
170
+ end
171
+ end
172
+
105
173
  def add_column(text)
106
174
  return unless @in_v
107
- @sheet.last << Integer(text, 10)
175
+ case @cell_type
176
+ when :shared_string
177
+ @sheet.last << Integer(text, 10)
178
+ else
179
+ @sheet.last << text
180
+ end
108
181
  end
109
182
  end
110
183
  end
@@ -34,11 +34,12 @@ module ChupaText
34
34
  end
35
35
  end
36
36
 
37
- def decompose(data)
37
+ def decompose(data, &block)
38
38
  context = {
39
- text: "",
39
+ data: data,
40
40
  attributes: {},
41
41
  }
42
+ start_decompose(context)
42
43
  data.open do |input|
43
44
  Archive::Zip.open(input) do |zip|
44
45
  zip.each do |entry|
@@ -56,12 +57,7 @@ module ChupaText
56
57
  end
57
58
  end
58
59
  end
59
- text = accumulate_text(context)
60
- text_data = TextData.new(text, source_data: data)
61
- context[:attributes].each do |name, value|
62
- text_data[name] = value
63
- end
64
- yield(text_data)
60
+ finish_decompose(context, &block)
65
61
  end
66
62
 
67
63
  private
@@ -60,6 +60,7 @@ module ChupaText
60
60
  @prefix_to_uri = {}
61
61
  @uri_to_prefix = {}
62
62
  @in_p = false
63
+ @in_shapes = false
63
64
  end
64
65
 
65
66
  def start_prefix_mapping(prefix, uri)
@@ -86,29 +87,44 @@ module ChupaText
86
87
  @sheets << {
87
88
  name: attributes["#{table_prefix}:name"],
88
89
  rows: [],
90
+ shape_texts: [],
89
91
  }
90
92
  when "table-row"
91
93
  @sheets.last[:rows] << []
92
94
  when "table-cell"
93
95
  @sheets.last[:rows].last << {text: ""}
96
+ when "covered-table-cell"
97
+ @sheets.last[:rows].last << {text: ""}
98
+ when "shapes"
99
+ @in_shapes = true
94
100
  end
95
101
  end
96
102
  end
97
103
 
98
104
  def end_element(uri, local_name, qname)
99
- @in_p = false
100
105
  case uri
106
+ when TEXT_URI
107
+ case local_name
108
+ when "p"
109
+ @in_p = false
110
+ end
101
111
  when TABLE_URI
102
112
  case local_name
103
113
  when "table"
104
114
  sheet = @sheets.last
105
115
  text = ""
116
+ shape_texts = sheet[:shape_texts]
117
+ unless shape_texts.empty?
118
+ text << shape_texts.join("\n") << "\n"
119
+ end
106
120
  sheet[:rows].each do |row|
107
121
  cell_texts = row.collect {|cell| cell[:text]}
108
122
  next if cell_texts.all?(&:empty?)
109
123
  text << cell_texts.join("\t") << "\n"
110
124
  end
111
125
  sheet[:text] = text
126
+ when "shapes"
127
+ @in_shapes = false
112
128
  end
113
129
  end
114
130
  end
@@ -124,7 +140,15 @@ module ChupaText
124
140
  private
125
141
  def add_text(text)
126
142
  return unless @in_p
127
- @sheets.last[:rows].last.last[:text] << text
143
+ sheet = @sheets.last
144
+ if @in_shapes
145
+ sheet[:shape_texts] << text
146
+ else
147
+ sheet[:rows].last.last[:text] << text
148
+ end
149
+ rescue
150
+ pp [text, @sheets]
151
+ raise
128
152
  end
129
153
  end
130
154
  end