chupa-text 1.1.5 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/chupa-text.gemspec +2 -1
  3. data/doc/text/news.md +42 -0
  4. data/lib/chupa-text/data.rb +19 -2
  5. data/lib/chupa-text/decomposers/csv.rb +20 -4
  6. data/lib/chupa-text/decomposers/office-open-xml-document.rb +12 -0
  7. data/lib/chupa-text/decomposers/office-open-xml-presentation.rb +17 -3
  8. data/lib/chupa-text/decomposers/office-open-xml-workbook.rb +83 -10
  9. data/lib/chupa-text/decomposers/office-open-xml.rb +4 -8
  10. data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb +26 -2
  11. data/lib/chupa-text/decomposers/tar.rb +18 -12
  12. data/lib/chupa-text/decomposers/zip.rb +30 -4
  13. data/lib/chupa-text/extractor.rb +5 -3
  14. data/lib/chupa-text/path-converter.rb +70 -0
  15. data/lib/chupa-text/utf8-converter.rb +117 -0
  16. data/lib/chupa-text/version.rb +1 -1
  17. data/test/command/test-chupa-text.rb +4 -4
  18. data/test/decomposers/test-csv.rb +18 -3
  19. data/test/decomposers/test-office-open-xml-presentation.rb +46 -46
  20. data/test/decomposers/test-office-open-xml-workbook.rb +64 -46
  21. data/test/decomposers/{test-open-document-presentation.rb → test-opendocument-presentation.rb} +0 -0
  22. data/test/decomposers/{test-open-document-spreadsheet.rb → test-opendocument-spreadsheet.rb} +35 -19
  23. data/test/decomposers/{test-open-document-text.rb → test-opendocument-text.rb} +0 -0
  24. data/test/decomposers/test-tar.rb +18 -1
  25. data/test/decomposers/test-zip.rb +31 -1
  26. data/test/fixture/ods/covered-table-cell.ods +0 -0
  27. data/test/fixture/ods/shapes.ods +0 -0
  28. data/test/fixture/tar/utf-8.tar +0 -0
  29. data/test/fixture/xlsx/not-shared-cell.xlsx +0 -0
  30. data/test/fixture/zip/cp932.zip +0 -0
  31. data/test/fixture/zip/utf-8.zip +0 -0
  32. data/test/helper.rb +31 -1
  33. data/test/test-data.rb +7 -3
  34. data/test/test-extractor.rb +108 -1
  35. metadata +29 -7
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 65f2eb5be135f3db4a0cbb81bc4a268bcb752934da6d9aa32740842bb4cc674f
4
- data.tar.gz: 72f8487dc57b67293e001621f6b63ece4680cf1069d9f98e3daa89708571766b
3
+ metadata.gz: 8a248b3b82aeb4ce65bd7b498dc2aab9ad6e43e6d5ea70a304163362c3e723e7
4
+ data.tar.gz: 82f3f02b63235924cab8608f1fde8d4ecbe4cb0e1aa92f12bf9ecff1debab21d
5
5
  SHA512:
6
- metadata.gz: 32149617d9c921de856aa9fb3856a8a29fde5e9e1872a16241235499b1b8e014941928fb040ad60d52c15cbc4b69d84a9860bd432074964865e7f1e85d673a1e
7
- data.tar.gz: 85c2820468547c9b34fef27faa2db0fc9d8208a266949177b2e4f2269bfadcd8f5b534c0cdb4aff9e28360c1509dd6b2d06ef72c7d1e268f399e97595a133a2c
6
+ metadata.gz: f2c3acee194db640b3b3e01a913cfc00727ed7b8e0e4b353f5fbb2d1723f689846164d44f2ffcf1874b2948ec3f3b578dacd49d27934693ad71f24b9901373a2
7
+ data.tar.gz: 12bd3c5ed94f85bad888e56a62ef478a606e8ce5fe8f376f7c2418924ef91094b1a6c150c60fd80278a7541bb5e177b7d9cb4b7ec5b67a482b8ff74c67953ba2
data/chupa-text.gemspec CHANGED
@@ -50,7 +50,8 @@ Gem::Specification.new do |spec|
50
50
  spec.executables = Dir.glob("*")
51
51
  end
52
52
 
53
- spec.add_runtime_dependency("archive-zip")
53
+ spec.add_runtime_dependency("archive-zip", ">= 0.12.0")
54
+ spec.add_runtime_dependency("csv", ">= 3.0.4")
54
55
 
55
56
  spec.add_development_dependency("bundler")
56
57
  spec.add_development_dependency("nokogiri")
data/doc/text/news.md CHANGED
@@ -1,5 +1,47 @@
1
1
  # News
2
2
 
3
+ ## 1.1.6: 2019-03-01
4
+
5
+ ### Improvements
6
+
7
+ * `zip`:
8
+
9
+ * Added support for multibyte path.
10
+
11
+ * Added error check.
12
+
13
+ * `tar`:
14
+
15
+ * Added support for multibyte path.
16
+
17
+ * Reduced memory usage.
18
+
19
+ * Changed the extracted text encoding to UTF-8.
20
+
21
+ * Added support for BOM detection.
22
+
23
+ * Improved binary data detection.
24
+
25
+ * `office-open-xml-workbook`:
26
+
27
+ * Added support for cell values that are not shared strings.
28
+
29
+ * Changed to emit data per sheet.
30
+
31
+ * `office-open-xml-presentation`:
32
+
33
+ * Changed to emit data per slide.
34
+
35
+ * `csv`:
36
+
37
+ * Added error check.
38
+
39
+ * `opendocument-spreadsheet`:
40
+
41
+ * Added support for concatenated cell.
42
+
43
+ * Added support for shapes.
44
+
3
45
  ## 1.1.5: 2019-02-28
4
46
 
5
47
  ### Improvements
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -18,6 +18,8 @@ require "cgi/util"
18
18
  require "uri"
19
19
  require "open-uri"
20
20
 
21
+ require "chupa-text/utf8-converter"
22
+
21
23
  module ChupaText
22
24
  class Data
23
25
  # @return [URI, nil] The URI of the data if the data is for remote
@@ -190,6 +192,18 @@ module ChupaText
190
192
  @need_screenshot
191
193
  end
192
194
 
195
+ def to_utf8_body_data
196
+ b = body
197
+ return self if b.nil?
198
+ converter = UTF8Converter.new(b)
199
+ utf8_body = converter.convert
200
+ if b.equal?(utf8_body)
201
+ self
202
+ else
203
+ TextData.new(utf8_body, source_data: self)
204
+ end
205
+ end
206
+
193
207
  private
194
208
  def guess_mime_type
195
209
  guess_mime_type_from_uri or
@@ -203,7 +217,10 @@ module ChupaText
203
217
  def guess_mime_type_from_body
204
218
  mime_type = nil
205
219
  change_encoding(body, "UTF-8") do |utf8_body|
206
- mime_type = "text/plain" if utf8_body.valid_encoding?
220
+ return nil unless utf8_body.valid_encoding?
221
+ n_null_characters = utf8_body.count("\u0000")
222
+ return nil if n_null_characters > (utf8_body.bytesize * 0.01)
223
+ mime_type = "text/plain"
207
224
  end
208
225
  mime_type
209
226
  end
@@ -20,6 +20,8 @@ require "csv"
20
20
  module ChupaText
21
21
  module Decomposers
22
22
  class CSV < Decomposer
23
+ include Loggable
24
+
23
25
  registry.register("csv", self)
24
26
 
25
27
  def target?(data)
@@ -36,10 +38,20 @@ module ChupaText
36
38
  def decompose(data)
37
39
  text = ""
38
40
  data.open do |input|
39
- csv = ::CSV.new(input)
40
- csv.each do |row|
41
- text << row.join(" ")
42
- text << "\n"
41
+ begin
42
+ csv = ::CSV.new(input, liberal_parsing: true)
43
+ csv.each do |row|
44
+ text << row.join("\t")
45
+ text << "\n"
46
+ end
47
+ rescue ::CSV::MalformedCSVError => csv_error
48
+ error do
49
+ message = "#{log_tag} Failed to parse CSV: "
50
+ message << "#{csv_error.class}: #{csv_error.message}\n"
51
+ message << csv_error.backtrace.join("\n")
52
+ message
53
+ end
54
+ return
43
55
  end
44
56
  end
45
57
 
@@ -78,6 +90,10 @@ module ChupaText
78
90
  SVG
79
91
  Screenshot.new(mime_type, data)
80
92
  end
93
+
94
+ def log_tag
95
+ "[decomposer][csv]"
96
+ end
81
97
  end
82
98
  end
83
99
  end
@@ -40,12 +40,24 @@ module ChupaText
40
40
  end
41
41
 
42
42
  private
43
+ def start_decompose(context)
44
+ context[:text] = ""
45
+ end
46
+
43
47
  def process_entry(entry, context)
44
48
  case entry.zip_path
45
49
  when "word/document.xml"
46
50
  extract_text(entry, context[:text])
47
51
  end
48
52
  end
53
+
54
+ def finish_decompose(context, &block)
55
+ text_data = TextData.new(context[:text], source_data: context[:data])
56
+ context[:attributes].each do |name, value|
57
+ text_data[name] = value
58
+ end
59
+ yield(text_data)
60
+ end
49
61
  end
50
62
  end
51
63
  end
@@ -48,19 +48,33 @@ module ChupaText
48
48
  end
49
49
 
50
50
  private
51
+ def start_decompose(context)
52
+ context[:slides] = []
53
+ end
54
+
51
55
  def process_entry(entry, context)
52
56
  case entry.zip_path
53
57
  when /\Appt\/slides\/slide(\d+)\.xml/
54
58
  nth_slide = Integer($1, 10)
55
59
  slide_text = ""
56
60
  extract_text(entry, slide_text)
57
- context[:slides] ||= []
58
61
  context[:slides] << [nth_slide, slide_text]
59
62
  end
60
63
  end
61
64
 
62
- def accumulate_text(context)
63
- context[:slides].sort_by(&:first).collect(&:last).join("\n")
65
+ def finish_decompose(context, &block)
66
+ metadata = TextData.new("", source_data: context[:data])
67
+ context[:attributes].each do |name, value|
68
+ metadata[name] = value
69
+ end
70
+ yield(metadata)
71
+
72
+ slide_texts = context[:slides].sort_by(&:first).collect(&:last)
73
+ slide_texts.each_with_index do |slide_text, i|
74
+ text_data = TextData.new(slide_text, source_data: context[:data])
75
+ text_data["index"] = i
76
+ yield(text_data)
77
+ end
64
78
  end
65
79
  end
66
80
  end
@@ -40,35 +40,73 @@ module ChupaText
40
40
  end
41
41
 
42
42
  private
43
+ def start_decompose(context)
44
+ context[:shared_strings] = []
45
+ context[:sheet_names] = []
46
+ context[:sheets] = []
47
+ end
48
+
43
49
  def process_entry(entry, context)
44
50
  case entry.zip_path
45
51
  when "xl/sharedStrings.xml"
46
- context[:shared_strings] = []
47
52
  extract_text(entry, context[:shared_strings])
53
+ when "xl/workbook.xml"
54
+ listener = WorkbookListener.new(context[:sheet_names])
55
+ parse(entry.file_data, listener)
48
56
  when /\Axl\/worksheets\/sheet(\d+)\.xml\z/
49
57
  nth_sheet = Integer($1, 10)
50
58
  sheet = []
51
59
  listener = SheetListener.new(sheet)
52
60
  parse(entry.file_data, listener)
53
- context[:sheets] ||= []
54
61
  context[:sheets] << [nth_sheet, sheet]
55
62
  end
56
63
  end
57
64
 
58
- def accumulate_text(context)
65
+ def finish_decompose(context, &block)
66
+ metadata = TextData.new("", source_data: context[:data])
67
+ context[:attributes].each do |name, value|
68
+ metadata[name] = value
69
+ end
70
+ yield(metadata)
71
+
59
72
  shared_strings = context[:shared_strings]
60
73
  sheets = context[:sheets].sort_by(&:first).collect(&:last)
61
- sheet_texts = sheets.collect do |sheet|
74
+ sheet_names = context[:sheet_names]
75
+ sheets.each_with_index do |sheet, i|
62
76
  sheet_text = ""
63
77
  sheet.each do |row|
64
- row_texts = row.collect do |index|
65
- shared_strings[index]
78
+ row_texts = row.collect do |cell|
79
+ case cell
80
+ when Integer
81
+ shared_strings[cell]
82
+ else
83
+ cell
84
+ end
66
85
  end
67
86
  sheet_text << row_texts.join("\t") << "\n"
68
87
  end
69
- sheet_text
88
+ text_data = TextData.new(sheet_text, source_data: context[:data])
89
+ text_data["index"] = i
90
+ name = sheet_names[i]
91
+ text_data["name"] = name if name
92
+ yield(text_data)
93
+ end
94
+ end
95
+
96
+ class WorkbookListener < SAXListener
97
+ URI = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
98
+
99
+ def initialize(sheet_names)
100
+ @sheet_names = sheet_names
101
+ end
102
+
103
+ def start_element(uri, local_name, qname, attributes)
104
+ return unless uri == URI
105
+ case local_name
106
+ when "sheet"
107
+ @sheet_names << attributes["name"]
108
+ end
70
109
  end
71
- sheet_texts.join("\n")
72
110
  end
73
111
 
74
112
  class SheetListener < SAXListener
@@ -76,6 +114,7 @@ module ChupaText
76
114
 
77
115
  def initialize(sheet)
78
116
  @sheet = sheet
117
+ @cell_type = nil
79
118
  @in_v = false
80
119
  end
81
120
 
@@ -84,13 +123,22 @@ module ChupaText
84
123
  case local_name
85
124
  when "row"
86
125
  @sheet << []
126
+ when "c"
127
+ @cell_type = parse_cell_type(attributes["t"])
128
+ # when "is" # TODO
87
129
  when "v"
88
130
  @in_v = true
89
131
  end
90
132
  end
91
133
 
92
134
  def end_element(uri, local_name, qname)
93
- @in_v = false
135
+ return unless uri == URI
136
+ case local_name
137
+ when "c"
138
+ @cell_type = nil
139
+ when "v"
140
+ @in_v = false
141
+ end
94
142
  end
95
143
 
96
144
  def characters(text)
@@ -102,9 +150,34 @@ module ChupaText
102
150
  end
103
151
 
104
152
  private
153
+ # https://c-rex.net/projects/samples/ooxml/e1/Part4/OOXML_P4_DOCX_ST_CellType_topic_ID0E6NEFB.html
154
+ def parse_cell_type(type)
155
+ case type
156
+ when "b"
157
+ :boolean
158
+ when "e"
159
+ :error
160
+ when "inlineStr"
161
+ :inline_string
162
+ when "n"
163
+ :number
164
+ when "s"
165
+ :shared_string
166
+ when "str"
167
+ :string
168
+ else
169
+ nil
170
+ end
171
+ end
172
+
105
173
  def add_column(text)
106
174
  return unless @in_v
107
- @sheet.last << Integer(text, 10)
175
+ case @cell_type
176
+ when :shared_string
177
+ @sheet.last << Integer(text, 10)
178
+ else
179
+ @sheet.last << text
180
+ end
108
181
  end
109
182
  end
110
183
  end
@@ -34,11 +34,12 @@ module ChupaText
34
34
  end
35
35
  end
36
36
 
37
- def decompose(data)
37
+ def decompose(data, &block)
38
38
  context = {
39
- text: "",
39
+ data: data,
40
40
  attributes: {},
41
41
  }
42
+ start_decompose(context)
42
43
  data.open do |input|
43
44
  Archive::Zip.open(input) do |zip|
44
45
  zip.each do |entry|
@@ -56,12 +57,7 @@ module ChupaText
56
57
  end
57
58
  end
58
59
  end
59
- text = accumulate_text(context)
60
- text_data = TextData.new(text, source_data: data)
61
- context[:attributes].each do |name, value|
62
- text_data[name] = value
63
- end
64
- yield(text_data)
60
+ finish_decompose(context, &block)
65
61
  end
66
62
 
67
63
  private
@@ -60,6 +60,7 @@ module ChupaText
60
60
  @prefix_to_uri = {}
61
61
  @uri_to_prefix = {}
62
62
  @in_p = false
63
+ @in_shapes = false
63
64
  end
64
65
 
65
66
  def start_prefix_mapping(prefix, uri)
@@ -86,29 +87,44 @@ module ChupaText
86
87
  @sheets << {
87
88
  name: attributes["#{table_prefix}:name"],
88
89
  rows: [],
90
+ shape_texts: [],
89
91
  }
90
92
  when "table-row"
91
93
  @sheets.last[:rows] << []
92
94
  when "table-cell"
93
95
  @sheets.last[:rows].last << {text: ""}
96
+ when "covered-table-cell"
97
+ @sheets.last[:rows].last << {text: ""}
98
+ when "shapes"
99
+ @in_shapes = true
94
100
  end
95
101
  end
96
102
  end
97
103
 
98
104
  def end_element(uri, local_name, qname)
99
- @in_p = false
100
105
  case uri
106
+ when TEXT_URI
107
+ case local_name
108
+ when "p"
109
+ @in_p = false
110
+ end
101
111
  when TABLE_URI
102
112
  case local_name
103
113
  when "table"
104
114
  sheet = @sheets.last
105
115
  text = ""
116
+ shape_texts = sheet[:shape_texts]
117
+ unless shape_texts.empty?
118
+ text << shape_texts.join("\n") << "\n"
119
+ end
106
120
  sheet[:rows].each do |row|
107
121
  cell_texts = row.collect {|cell| cell[:text]}
108
122
  next if cell_texts.all?(&:empty?)
109
123
  text << cell_texts.join("\t") << "\n"
110
124
  end
111
125
  sheet[:text] = text
126
+ when "shapes"
127
+ @in_shapes = false
112
128
  end
113
129
  end
114
130
  end
@@ -124,7 +140,15 @@ module ChupaText
124
140
  private
125
141
  def add_text(text)
126
142
  return unless @in_p
127
- @sheets.last[:rows].last.last[:text] << text
143
+ sheet = @sheets.last
144
+ if @in_shapes
145
+ sheet[:shape_texts] << text
146
+ else
147
+ sheet[:rows].last.last[:text] << text
148
+ end
149
+ rescue
150
+ pp [text, @sheets]
151
+ raise
128
152
  end
129
153
  end
130
154
  end