chupa-text 1.1.5 → 1.1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/chupa-text.gemspec +2 -1
- data/doc/text/news.md +42 -0
- data/lib/chupa-text/data.rb +19 -2
- data/lib/chupa-text/decomposers/csv.rb +20 -4
- data/lib/chupa-text/decomposers/office-open-xml-document.rb +12 -0
- data/lib/chupa-text/decomposers/office-open-xml-presentation.rb +17 -3
- data/lib/chupa-text/decomposers/office-open-xml-workbook.rb +83 -10
- data/lib/chupa-text/decomposers/office-open-xml.rb +4 -8
- data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb +26 -2
- data/lib/chupa-text/decomposers/tar.rb +18 -12
- data/lib/chupa-text/decomposers/zip.rb +30 -4
- data/lib/chupa-text/extractor.rb +5 -3
- data/lib/chupa-text/path-converter.rb +70 -0
- data/lib/chupa-text/utf8-converter.rb +117 -0
- data/lib/chupa-text/version.rb +1 -1
- data/test/command/test-chupa-text.rb +4 -4
- data/test/decomposers/test-csv.rb +18 -3
- data/test/decomposers/test-office-open-xml-presentation.rb +46 -46
- data/test/decomposers/test-office-open-xml-workbook.rb +64 -46
- data/test/decomposers/{test-open-document-presentation.rb → test-opendocument-presentation.rb} +0 -0
- data/test/decomposers/{test-open-document-spreadsheet.rb → test-opendocument-spreadsheet.rb} +35 -19
- data/test/decomposers/{test-open-document-text.rb → test-opendocument-text.rb} +0 -0
- data/test/decomposers/test-tar.rb +18 -1
- data/test/decomposers/test-zip.rb +31 -1
- data/test/fixture/ods/covered-table-cell.ods +0 -0
- data/test/fixture/ods/shapes.ods +0 -0
- data/test/fixture/tar/utf-8.tar +0 -0
- data/test/fixture/xlsx/not-shared-cell.xlsx +0 -0
- data/test/fixture/zip/cp932.zip +0 -0
- data/test/fixture/zip/utf-8.zip +0 -0
- data/test/helper.rb +31 -1
- data/test/test-data.rb +7 -3
- data/test/test-extractor.rb +108 -1
- metadata +29 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8a248b3b82aeb4ce65bd7b498dc2aab9ad6e43e6d5ea70a304163362c3e723e7
|
4
|
+
data.tar.gz: 82f3f02b63235924cab8608f1fde8d4ecbe4cb0e1aa92f12bf9ecff1debab21d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f2c3acee194db640b3b3e01a913cfc00727ed7b8e0e4b353f5fbb2d1723f689846164d44f2ffcf1874b2948ec3f3b578dacd49d27934693ad71f24b9901373a2
|
7
|
+
data.tar.gz: 12bd3c5ed94f85bad888e56a62ef478a606e8ce5fe8f376f7c2418924ef91094b1a6c150c60fd80278a7541bb5e177b7d9cb4b7ec5b67a482b8ff74c67953ba2
|
data/chupa-text.gemspec
CHANGED
@@ -50,7 +50,8 @@ Gem::Specification.new do |spec|
|
|
50
50
|
spec.executables = Dir.glob("*")
|
51
51
|
end
|
52
52
|
|
53
|
-
spec.add_runtime_dependency("archive-zip")
|
53
|
+
spec.add_runtime_dependency("archive-zip", ">= 0.12.0")
|
54
|
+
spec.add_runtime_dependency("csv", ">= 3.0.4")
|
54
55
|
|
55
56
|
spec.add_development_dependency("bundler")
|
56
57
|
spec.add_development_dependency("nokogiri")
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,47 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 1.1.6: 2019-03-01
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* `zip`:
|
8
|
+
|
9
|
+
* Added support for multibyte path.
|
10
|
+
|
11
|
+
* Added error check.
|
12
|
+
|
13
|
+
* `tar`:
|
14
|
+
|
15
|
+
* Added support for multibyte path.
|
16
|
+
|
17
|
+
* Reduced memory usage.
|
18
|
+
|
19
|
+
* Changed the extracted text encoding to UTF-8.
|
20
|
+
|
21
|
+
* Added support for BOM detection.
|
22
|
+
|
23
|
+
* Improved binary data detection.
|
24
|
+
|
25
|
+
* `office-open-xml-workbook`:
|
26
|
+
|
27
|
+
* Added support for not shared string cell values.
|
28
|
+
|
29
|
+
* Changed to emit data per sheet.
|
30
|
+
|
31
|
+
* `office-open-xml-presentation`:
|
32
|
+
|
33
|
+
* Changed to emit data per slide.
|
34
|
+
|
35
|
+
* `csv`:
|
36
|
+
|
37
|
+
* Added error check.
|
38
|
+
|
39
|
+
* `opendocument-spreadsheet`:
|
40
|
+
|
41
|
+
* Added support for concatenated cell.
|
42
|
+
|
43
|
+
* Added support for shapes.
|
44
|
+
|
3
45
|
## 1.1.5: 2019-02-28
|
4
46
|
|
5
47
|
### Improvements
|
data/lib/chupa-text/data.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013-
|
1
|
+
# Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -18,6 +18,8 @@ require "cgi/util"
|
|
18
18
|
require "uri"
|
19
19
|
require "open-uri"
|
20
20
|
|
21
|
+
require "chupa-text/utf8-converter"
|
22
|
+
|
21
23
|
module ChupaText
|
22
24
|
class Data
|
23
25
|
# @return [URI, nil] The URI of the data if the data is for remote
|
@@ -190,6 +192,18 @@ module ChupaText
|
|
190
192
|
@need_screenshot
|
191
193
|
end
|
192
194
|
|
195
|
+
def to_utf8_body_data
|
196
|
+
b = body
|
197
|
+
return self if b.nil?
|
198
|
+
converter = UTF8Converter.new(b)
|
199
|
+
utf8_body = converter.convert
|
200
|
+
if b.equal?(utf8_body)
|
201
|
+
self
|
202
|
+
else
|
203
|
+
TextData.new(utf8_body, source_data: self)
|
204
|
+
end
|
205
|
+
end
|
206
|
+
|
193
207
|
private
|
194
208
|
def guess_mime_type
|
195
209
|
guess_mime_type_from_uri or
|
@@ -203,7 +217,10 @@ module ChupaText
|
|
203
217
|
def guess_mime_type_from_body
|
204
218
|
mime_type = nil
|
205
219
|
change_encoding(body, "UTF-8") do |utf8_body|
|
206
|
-
|
220
|
+
return nil unless utf8_body.valid_encoding?
|
221
|
+
n_null_characters = utf8_body.count("\u0000")
|
222
|
+
return nil if n_null_characters > (utf8_body.bytesize * 0.01)
|
223
|
+
mime_type = "text/plain"
|
207
224
|
end
|
208
225
|
mime_type
|
209
226
|
end
|
@@ -20,6 +20,8 @@ require "csv"
|
|
20
20
|
module ChupaText
|
21
21
|
module Decomposers
|
22
22
|
class CSV < Decomposer
|
23
|
+
include Loggable
|
24
|
+
|
23
25
|
registry.register("csv", self)
|
24
26
|
|
25
27
|
def target?(data)
|
@@ -36,10 +38,20 @@ module ChupaText
|
|
36
38
|
def decompose(data)
|
37
39
|
text = ""
|
38
40
|
data.open do |input|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
41
|
+
begin
|
42
|
+
csv = ::CSV.new(input, liberal_parsing: true)
|
43
|
+
csv.each do |row|
|
44
|
+
text << row.join("\t")
|
45
|
+
text << "\n"
|
46
|
+
end
|
47
|
+
rescue ::CSV::MalformedCSVError => csv_error
|
48
|
+
error do
|
49
|
+
message = "#{log_tag} Failed to parse CSV: "
|
50
|
+
message << "#{csv_error.class}: #{csv_error.message}\n"
|
51
|
+
message << csv_error.backtrace.join("\n")
|
52
|
+
message
|
53
|
+
end
|
54
|
+
return
|
43
55
|
end
|
44
56
|
end
|
45
57
|
|
@@ -78,6 +90,10 @@ module ChupaText
|
|
78
90
|
SVG
|
79
91
|
Screenshot.new(mime_type, data)
|
80
92
|
end
|
93
|
+
|
94
|
+
def log_tag
|
95
|
+
"[decomposer][csv]"
|
96
|
+
end
|
81
97
|
end
|
82
98
|
end
|
83
99
|
end
|
@@ -40,12 +40,24 @@ module ChupaText
|
|
40
40
|
end
|
41
41
|
|
42
42
|
private
|
43
|
+
def start_decompose(context)
|
44
|
+
context[:text] = ""
|
45
|
+
end
|
46
|
+
|
43
47
|
def process_entry(entry, context)
|
44
48
|
case entry.zip_path
|
45
49
|
when "word/document.xml"
|
46
50
|
extract_text(entry, context[:text])
|
47
51
|
end
|
48
52
|
end
|
53
|
+
|
54
|
+
def finish_decompose(context, &block)
|
55
|
+
text_data = TextData.new(context[:text], source_data: context[:data])
|
56
|
+
context[:attributes].each do |name, value|
|
57
|
+
text_data[name] = value
|
58
|
+
end
|
59
|
+
yield(text_data)
|
60
|
+
end
|
49
61
|
end
|
50
62
|
end
|
51
63
|
end
|
@@ -48,19 +48,33 @@ module ChupaText
|
|
48
48
|
end
|
49
49
|
|
50
50
|
private
|
51
|
+
def start_decompose(context)
|
52
|
+
context[:slides] = []
|
53
|
+
end
|
54
|
+
|
51
55
|
def process_entry(entry, context)
|
52
56
|
case entry.zip_path
|
53
57
|
when /\Appt\/slides\/slide(\d+)\.xml/
|
54
58
|
nth_slide = Integer($1, 10)
|
55
59
|
slide_text = ""
|
56
60
|
extract_text(entry, slide_text)
|
57
|
-
context[:slides] ||= []
|
58
61
|
context[:slides] << [nth_slide, slide_text]
|
59
62
|
end
|
60
63
|
end
|
61
64
|
|
62
|
-
def
|
63
|
-
context[:
|
65
|
+
def finish_decompose(context, &block)
|
66
|
+
metadata = TextData.new("", source_data: context[:data])
|
67
|
+
context[:attributes].each do |name, value|
|
68
|
+
metadata[name] = value
|
69
|
+
end
|
70
|
+
yield(metadata)
|
71
|
+
|
72
|
+
slide_texts = context[:slides].sort_by(&:first).collect(&:last)
|
73
|
+
slide_texts.each_with_index do |slide_text, i|
|
74
|
+
text_data = TextData.new(slide_text, source_data: context[:data])
|
75
|
+
text_data["index"] = i
|
76
|
+
yield(text_data)
|
77
|
+
end
|
64
78
|
end
|
65
79
|
end
|
66
80
|
end
|
@@ -40,35 +40,73 @@ module ChupaText
|
|
40
40
|
end
|
41
41
|
|
42
42
|
private
|
43
|
+
def start_decompose(context)
|
44
|
+
context[:shared_strings] = []
|
45
|
+
context[:sheet_names] = []
|
46
|
+
context[:sheets] = []
|
47
|
+
end
|
48
|
+
|
43
49
|
def process_entry(entry, context)
|
44
50
|
case entry.zip_path
|
45
51
|
when "xl/sharedStrings.xml"
|
46
|
-
context[:shared_strings] = []
|
47
52
|
extract_text(entry, context[:shared_strings])
|
53
|
+
when "xl/workbook.xml"
|
54
|
+
listener = WorkbookListener.new(context[:sheet_names])
|
55
|
+
parse(entry.file_data, listener)
|
48
56
|
when /\Axl\/worksheets\/sheet(\d+)\.xml\z/
|
49
57
|
nth_sheet = Integer($1, 10)
|
50
58
|
sheet = []
|
51
59
|
listener = SheetListener.new(sheet)
|
52
60
|
parse(entry.file_data, listener)
|
53
|
-
context[:sheets] ||= []
|
54
61
|
context[:sheets] << [nth_sheet, sheet]
|
55
62
|
end
|
56
63
|
end
|
57
64
|
|
58
|
-
def
|
65
|
+
def finish_decompose(context, &block)
|
66
|
+
metadata = TextData.new("", source_data: context[:data])
|
67
|
+
context[:attributes].each do |name, value|
|
68
|
+
metadata[name] = value
|
69
|
+
end
|
70
|
+
yield(metadata)
|
71
|
+
|
59
72
|
shared_strings = context[:shared_strings]
|
60
73
|
sheets = context[:sheets].sort_by(&:first).collect(&:last)
|
61
|
-
|
74
|
+
sheet_names = context[:sheet_names]
|
75
|
+
sheets.each_with_index do |sheet, i|
|
62
76
|
sheet_text = ""
|
63
77
|
sheet.each do |row|
|
64
|
-
row_texts = row.collect do |
|
65
|
-
|
78
|
+
row_texts = row.collect do |cell|
|
79
|
+
case cell
|
80
|
+
when Integer
|
81
|
+
shared_strings[cell]
|
82
|
+
else
|
83
|
+
cell
|
84
|
+
end
|
66
85
|
end
|
67
86
|
sheet_text << row_texts.join("\t") << "\n"
|
68
87
|
end
|
69
|
-
sheet_text
|
88
|
+
text_data = TextData.new(sheet_text, source_data: context[:data])
|
89
|
+
text_data["index"] = i
|
90
|
+
name = sheet_names[i]
|
91
|
+
text_data["name"] = name if name
|
92
|
+
yield(text_data)
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
class WorkbookListener < SAXListener
|
97
|
+
URI = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
|
98
|
+
|
99
|
+
def initialize(sheet_names)
|
100
|
+
@sheet_names = sheet_names
|
101
|
+
end
|
102
|
+
|
103
|
+
def start_element(uri, local_name, qname, attributes)
|
104
|
+
return unless uri == URI
|
105
|
+
case local_name
|
106
|
+
when "sheet"
|
107
|
+
@sheet_names << attributes["name"]
|
108
|
+
end
|
70
109
|
end
|
71
|
-
sheet_texts.join("\n")
|
72
110
|
end
|
73
111
|
|
74
112
|
class SheetListener < SAXListener
|
@@ -76,6 +114,7 @@ module ChupaText
|
|
76
114
|
|
77
115
|
def initialize(sheet)
|
78
116
|
@sheet = sheet
|
117
|
+
@cell_type = nil
|
79
118
|
@in_v = false
|
80
119
|
end
|
81
120
|
|
@@ -84,13 +123,22 @@ module ChupaText
|
|
84
123
|
case local_name
|
85
124
|
when "row"
|
86
125
|
@sheet << []
|
126
|
+
when "c"
|
127
|
+
@cell_type = parse_cell_type(attributes["t"])
|
128
|
+
# when "is" # TODO
|
87
129
|
when "v"
|
88
130
|
@in_v = true
|
89
131
|
end
|
90
132
|
end
|
91
133
|
|
92
134
|
def end_element(uri, local_name, qname)
|
93
|
-
|
135
|
+
return unless uri == URI
|
136
|
+
case local_name
|
137
|
+
when "c"
|
138
|
+
@cell_type = nil
|
139
|
+
when "v"
|
140
|
+
@in_v = false
|
141
|
+
end
|
94
142
|
end
|
95
143
|
|
96
144
|
def characters(text)
|
@@ -102,9 +150,34 @@ module ChupaText
|
|
102
150
|
end
|
103
151
|
|
104
152
|
private
|
153
|
+
# https://c-rex.net/projects/samples/ooxml/e1/Part4/OOXML_P4_DOCX_ST_CellType_topic_ID0E6NEFB.html
|
154
|
+
def parse_cell_type(type)
|
155
|
+
case type
|
156
|
+
when "b"
|
157
|
+
:boolean
|
158
|
+
when "e"
|
159
|
+
:error
|
160
|
+
when "inlineStr"
|
161
|
+
:inline_string
|
162
|
+
when "n"
|
163
|
+
:number
|
164
|
+
when "s"
|
165
|
+
:shared_string
|
166
|
+
when "str"
|
167
|
+
:string
|
168
|
+
else
|
169
|
+
nil
|
170
|
+
end
|
171
|
+
end
|
172
|
+
|
105
173
|
def add_column(text)
|
106
174
|
return unless @in_v
|
107
|
-
@
|
175
|
+
case @cell_type
|
176
|
+
when :shared_string
|
177
|
+
@sheet.last << Integer(text, 10)
|
178
|
+
else
|
179
|
+
@sheet.last << text
|
180
|
+
end
|
108
181
|
end
|
109
182
|
end
|
110
183
|
end
|
@@ -34,11 +34,12 @@ module ChupaText
|
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
37
|
-
def decompose(data)
|
37
|
+
def decompose(data, &block)
|
38
38
|
context = {
|
39
|
-
|
39
|
+
data: data,
|
40
40
|
attributes: {},
|
41
41
|
}
|
42
|
+
start_decompose(context)
|
42
43
|
data.open do |input|
|
43
44
|
Archive::Zip.open(input) do |zip|
|
44
45
|
zip.each do |entry|
|
@@ -56,12 +57,7 @@ module ChupaText
|
|
56
57
|
end
|
57
58
|
end
|
58
59
|
end
|
59
|
-
|
60
|
-
text_data = TextData.new(text, source_data: data)
|
61
|
-
context[:attributes].each do |name, value|
|
62
|
-
text_data[name] = value
|
63
|
-
end
|
64
|
-
yield(text_data)
|
60
|
+
finish_decompose(context, &block)
|
65
61
|
end
|
66
62
|
|
67
63
|
private
|
@@ -60,6 +60,7 @@ module ChupaText
|
|
60
60
|
@prefix_to_uri = {}
|
61
61
|
@uri_to_prefix = {}
|
62
62
|
@in_p = false
|
63
|
+
@in_shapes = false
|
63
64
|
end
|
64
65
|
|
65
66
|
def start_prefix_mapping(prefix, uri)
|
@@ -86,29 +87,44 @@ module ChupaText
|
|
86
87
|
@sheets << {
|
87
88
|
name: attributes["#{table_prefix}:name"],
|
88
89
|
rows: [],
|
90
|
+
shape_texts: [],
|
89
91
|
}
|
90
92
|
when "table-row"
|
91
93
|
@sheets.last[:rows] << []
|
92
94
|
when "table-cell"
|
93
95
|
@sheets.last[:rows].last << {text: ""}
|
96
|
+
when "covered-table-cell"
|
97
|
+
@sheets.last[:rows].last << {text: ""}
|
98
|
+
when "shapes"
|
99
|
+
@in_shapes = true
|
94
100
|
end
|
95
101
|
end
|
96
102
|
end
|
97
103
|
|
98
104
|
def end_element(uri, local_name, qname)
|
99
|
-
@in_p = false
|
100
105
|
case uri
|
106
|
+
when TEXT_URI
|
107
|
+
case local_name
|
108
|
+
when "p"
|
109
|
+
@in_p = false
|
110
|
+
end
|
101
111
|
when TABLE_URI
|
102
112
|
case local_name
|
103
113
|
when "table"
|
104
114
|
sheet = @sheets.last
|
105
115
|
text = ""
|
116
|
+
shape_texts = sheet[:shape_texts]
|
117
|
+
unless shape_texts.empty?
|
118
|
+
text << shape_texts.join("\n") << "\n"
|
119
|
+
end
|
106
120
|
sheet[:rows].each do |row|
|
107
121
|
cell_texts = row.collect {|cell| cell[:text]}
|
108
122
|
next if cell_texts.all?(&:empty?)
|
109
123
|
text << cell_texts.join("\t") << "\n"
|
110
124
|
end
|
111
125
|
sheet[:text] = text
|
126
|
+
when "shapes"
|
127
|
+
@in_shapes = false
|
112
128
|
end
|
113
129
|
end
|
114
130
|
end
|
@@ -124,7 +140,15 @@ module ChupaText
|
|
124
140
|
private
|
125
141
|
def add_text(text)
|
126
142
|
return unless @in_p
|
127
|
-
@sheets.last
|
143
|
+
sheet = @sheets.last
|
144
|
+
if @in_shapes
|
145
|
+
sheet[:shape_texts] << text
|
146
|
+
else
|
147
|
+
sheet[:rows].last.last[:text] << text
|
148
|
+
end
|
149
|
+
rescue
|
150
|
+
pp [text, @sheets]
|
151
|
+
raise
|
128
152
|
end
|
129
153
|
end
|
130
154
|
end
|