chupa-text 1.1.5 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/chupa-text.gemspec +2 -1
- data/doc/text/news.md +42 -0
- data/lib/chupa-text/data.rb +19 -2
- data/lib/chupa-text/decomposers/csv.rb +20 -4
- data/lib/chupa-text/decomposers/office-open-xml-document.rb +12 -0
- data/lib/chupa-text/decomposers/office-open-xml-presentation.rb +17 -3
- data/lib/chupa-text/decomposers/office-open-xml-workbook.rb +83 -10
- data/lib/chupa-text/decomposers/office-open-xml.rb +4 -8
- data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb +26 -2
- data/lib/chupa-text/decomposers/tar.rb +18 -12
- data/lib/chupa-text/decomposers/zip.rb +30 -4
- data/lib/chupa-text/extractor.rb +5 -3
- data/lib/chupa-text/path-converter.rb +70 -0
- data/lib/chupa-text/utf8-converter.rb +117 -0
- data/lib/chupa-text/version.rb +1 -1
- data/test/command/test-chupa-text.rb +4 -4
- data/test/decomposers/test-csv.rb +18 -3
- data/test/decomposers/test-office-open-xml-presentation.rb +46 -46
- data/test/decomposers/test-office-open-xml-workbook.rb +64 -46
- data/test/decomposers/{test-open-document-presentation.rb → test-opendocument-presentation.rb} +0 -0
- data/test/decomposers/{test-open-document-spreadsheet.rb → test-opendocument-spreadsheet.rb} +35 -19
- data/test/decomposers/{test-open-document-text.rb → test-opendocument-text.rb} +0 -0
- data/test/decomposers/test-tar.rb +18 -1
- data/test/decomposers/test-zip.rb +31 -1
- data/test/fixture/ods/covered-table-cell.ods +0 -0
- data/test/fixture/ods/shapes.ods +0 -0
- data/test/fixture/tar/utf-8.tar +0 -0
- data/test/fixture/xlsx/not-shared-cell.xlsx +0 -0
- data/test/fixture/zip/cp932.zip +0 -0
- data/test/fixture/zip/utf-8.zip +0 -0
- data/test/helper.rb +31 -1
- data/test/test-data.rb +7 -3
- data/test/test-extractor.rb +108 -1
- metadata +29 -7
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 8a248b3b82aeb4ce65bd7b498dc2aab9ad6e43e6d5ea70a304163362c3e723e7
|
|
4
|
+
data.tar.gz: 82f3f02b63235924cab8608f1fde8d4ecbe4cb0e1aa92f12bf9ecff1debab21d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: f2c3acee194db640b3b3e01a913cfc00727ed7b8e0e4b353f5fbb2d1723f689846164d44f2ffcf1874b2948ec3f3b578dacd49d27934693ad71f24b9901373a2
|
|
7
|
+
data.tar.gz: 12bd3c5ed94f85bad888e56a62ef478a606e8ce5fe8f376f7c2418924ef91094b1a6c150c60fd80278a7541bb5e177b7d9cb4b7ec5b67a482b8ff74c67953ba2
|
data/chupa-text.gemspec
CHANGED
|
@@ -50,7 +50,8 @@ Gem::Specification.new do |spec|
|
|
|
50
50
|
spec.executables = Dir.glob("*")
|
|
51
51
|
end
|
|
52
52
|
|
|
53
|
-
spec.add_runtime_dependency("archive-zip")
|
|
53
|
+
spec.add_runtime_dependency("archive-zip", ">= 0.12.0")
|
|
54
|
+
spec.add_runtime_dependency("csv", ">= 3.0.4")
|
|
54
55
|
|
|
55
56
|
spec.add_development_dependency("bundler")
|
|
56
57
|
spec.add_development_dependency("nokogiri")
|
data/doc/text/news.md
CHANGED
|
@@ -1,5 +1,47 @@
|
|
|
1
1
|
# News
|
|
2
2
|
|
|
3
|
+
## 1.1.6: 2019-03-01
|
|
4
|
+
|
|
5
|
+
### Improvements
|
|
6
|
+
|
|
7
|
+
* `zip`:
|
|
8
|
+
|
|
9
|
+
* Added support for multibyte path.
|
|
10
|
+
|
|
11
|
+
* Added error check.
|
|
12
|
+
|
|
13
|
+
* `tar`:
|
|
14
|
+
|
|
15
|
+
* Added support for multibyte path.
|
|
16
|
+
|
|
17
|
+
* Reduced memory usage.
|
|
18
|
+
|
|
19
|
+
* Changed the extracted text encoding to UTF-8.
|
|
20
|
+
|
|
21
|
+
* Added support for BOM detection.
|
|
22
|
+
|
|
23
|
+
* Improved binary data detection.
|
|
24
|
+
|
|
25
|
+
* `office-open-xml-workbook`:
|
|
26
|
+
|
|
27
|
+
* Added support for non-shared string cell values.
|
|
28
|
+
|
|
29
|
+
* Changed to emit data per sheet.
|
|
30
|
+
|
|
31
|
+
* `office-open-xml-presentation`:
|
|
32
|
+
|
|
33
|
+
* Changed to emit data per slide.
|
|
34
|
+
|
|
35
|
+
* `csv`:
|
|
36
|
+
|
|
37
|
+
* Added error check.
|
|
38
|
+
|
|
39
|
+
* `opendocument-spreadsheet`:
|
|
40
|
+
|
|
41
|
+
* Added support for concatenated cell.
|
|
42
|
+
|
|
43
|
+
* Added support for shapes.
|
|
44
|
+
|
|
3
45
|
## 1.1.5: 2019-02-28
|
|
4
46
|
|
|
5
47
|
### Improvements
|
data/lib/chupa-text/data.rb
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright (C) 2013-
|
|
1
|
+
# Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
|
|
2
2
|
#
|
|
3
3
|
# This library is free software; you can redistribute it and/or
|
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
|
@@ -18,6 +18,8 @@ require "cgi/util"
|
|
|
18
18
|
require "uri"
|
|
19
19
|
require "open-uri"
|
|
20
20
|
|
|
21
|
+
require "chupa-text/utf8-converter"
|
|
22
|
+
|
|
21
23
|
module ChupaText
|
|
22
24
|
class Data
|
|
23
25
|
# @return [URI, nil] The URI of the data if the data is for remote
|
|
@@ -190,6 +192,18 @@ module ChupaText
|
|
|
190
192
|
@need_screenshot
|
|
191
193
|
end
|
|
192
194
|
|
|
195
|
+
def to_utf8_body_data
|
|
196
|
+
b = body
|
|
197
|
+
return self if b.nil?
|
|
198
|
+
converter = UTF8Converter.new(b)
|
|
199
|
+
utf8_body = converter.convert
|
|
200
|
+
if b.equal?(utf8_body)
|
|
201
|
+
self
|
|
202
|
+
else
|
|
203
|
+
TextData.new(utf8_body, source_data: self)
|
|
204
|
+
end
|
|
205
|
+
end
|
|
206
|
+
|
|
193
207
|
private
|
|
194
208
|
def guess_mime_type
|
|
195
209
|
guess_mime_type_from_uri or
|
|
@@ -203,7 +217,10 @@ module ChupaText
|
|
|
203
217
|
def guess_mime_type_from_body
|
|
204
218
|
mime_type = nil
|
|
205
219
|
change_encoding(body, "UTF-8") do |utf8_body|
|
|
206
|
-
|
|
220
|
+
return nil unless utf8_body.valid_encoding?
|
|
221
|
+
n_null_characters = utf8_body.count("\u0000")
|
|
222
|
+
return nil if n_null_characters > (utf8_body.bytesize * 0.01)
|
|
223
|
+
mime_type = "text/plain"
|
|
207
224
|
end
|
|
208
225
|
mime_type
|
|
209
226
|
end
|
|
@@ -20,6 +20,8 @@ require "csv"
|
|
|
20
20
|
module ChupaText
|
|
21
21
|
module Decomposers
|
|
22
22
|
class CSV < Decomposer
|
|
23
|
+
include Loggable
|
|
24
|
+
|
|
23
25
|
registry.register("csv", self)
|
|
24
26
|
|
|
25
27
|
def target?(data)
|
|
@@ -36,10 +38,20 @@ module ChupaText
|
|
|
36
38
|
def decompose(data)
|
|
37
39
|
text = ""
|
|
38
40
|
data.open do |input|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
41
|
+
begin
|
|
42
|
+
csv = ::CSV.new(input, liberal_parsing: true)
|
|
43
|
+
csv.each do |row|
|
|
44
|
+
text << row.join("\t")
|
|
45
|
+
text << "\n"
|
|
46
|
+
end
|
|
47
|
+
rescue ::CSV::MalformedCSVError => csv_error
|
|
48
|
+
error do
|
|
49
|
+
message = "#{log_tag} Failed to parse CSV: "
|
|
50
|
+
message << "#{csv_error.class}: #{csv_error.message}\n"
|
|
51
|
+
message << csv_error.backtrace.join("\n")
|
|
52
|
+
message
|
|
53
|
+
end
|
|
54
|
+
return
|
|
43
55
|
end
|
|
44
56
|
end
|
|
45
57
|
|
|
@@ -78,6 +90,10 @@ module ChupaText
|
|
|
78
90
|
SVG
|
|
79
91
|
Screenshot.new(mime_type, data)
|
|
80
92
|
end
|
|
93
|
+
|
|
94
|
+
def log_tag
|
|
95
|
+
"[decomposer][csv]"
|
|
96
|
+
end
|
|
81
97
|
end
|
|
82
98
|
end
|
|
83
99
|
end
|
|
@@ -40,12 +40,24 @@ module ChupaText
|
|
|
40
40
|
end
|
|
41
41
|
|
|
42
42
|
private
|
|
43
|
+
def start_decompose(context)
|
|
44
|
+
context[:text] = ""
|
|
45
|
+
end
|
|
46
|
+
|
|
43
47
|
def process_entry(entry, context)
|
|
44
48
|
case entry.zip_path
|
|
45
49
|
when "word/document.xml"
|
|
46
50
|
extract_text(entry, context[:text])
|
|
47
51
|
end
|
|
48
52
|
end
|
|
53
|
+
|
|
54
|
+
def finish_decompose(context, &block)
|
|
55
|
+
text_data = TextData.new(context[:text], source_data: context[:data])
|
|
56
|
+
context[:attributes].each do |name, value|
|
|
57
|
+
text_data[name] = value
|
|
58
|
+
end
|
|
59
|
+
yield(text_data)
|
|
60
|
+
end
|
|
49
61
|
end
|
|
50
62
|
end
|
|
51
63
|
end
|
|
@@ -48,19 +48,33 @@ module ChupaText
|
|
|
48
48
|
end
|
|
49
49
|
|
|
50
50
|
private
|
|
51
|
+
def start_decompose(context)
|
|
52
|
+
context[:slides] = []
|
|
53
|
+
end
|
|
54
|
+
|
|
51
55
|
def process_entry(entry, context)
|
|
52
56
|
case entry.zip_path
|
|
53
57
|
when /\Appt\/slides\/slide(\d+)\.xml/
|
|
54
58
|
nth_slide = Integer($1, 10)
|
|
55
59
|
slide_text = ""
|
|
56
60
|
extract_text(entry, slide_text)
|
|
57
|
-
context[:slides] ||= []
|
|
58
61
|
context[:slides] << [nth_slide, slide_text]
|
|
59
62
|
end
|
|
60
63
|
end
|
|
61
64
|
|
|
62
|
-
def
|
|
63
|
-
context[:
|
|
65
|
+
def finish_decompose(context, &block)
|
|
66
|
+
metadata = TextData.new("", source_data: context[:data])
|
|
67
|
+
context[:attributes].each do |name, value|
|
|
68
|
+
metadata[name] = value
|
|
69
|
+
end
|
|
70
|
+
yield(metadata)
|
|
71
|
+
|
|
72
|
+
slide_texts = context[:slides].sort_by(&:first).collect(&:last)
|
|
73
|
+
slide_texts.each_with_index do |slide_text, i|
|
|
74
|
+
text_data = TextData.new(slide_text, source_data: context[:data])
|
|
75
|
+
text_data["index"] = i
|
|
76
|
+
yield(text_data)
|
|
77
|
+
end
|
|
64
78
|
end
|
|
65
79
|
end
|
|
66
80
|
end
|
|
@@ -40,35 +40,73 @@ module ChupaText
|
|
|
40
40
|
end
|
|
41
41
|
|
|
42
42
|
private
|
|
43
|
+
def start_decompose(context)
|
|
44
|
+
context[:shared_strings] = []
|
|
45
|
+
context[:sheet_names] = []
|
|
46
|
+
context[:sheets] = []
|
|
47
|
+
end
|
|
48
|
+
|
|
43
49
|
def process_entry(entry, context)
|
|
44
50
|
case entry.zip_path
|
|
45
51
|
when "xl/sharedStrings.xml"
|
|
46
|
-
context[:shared_strings] = []
|
|
47
52
|
extract_text(entry, context[:shared_strings])
|
|
53
|
+
when "xl/workbook.xml"
|
|
54
|
+
listener = WorkbookListener.new(context[:sheet_names])
|
|
55
|
+
parse(entry.file_data, listener)
|
|
48
56
|
when /\Axl\/worksheets\/sheet(\d+)\.xml\z/
|
|
49
57
|
nth_sheet = Integer($1, 10)
|
|
50
58
|
sheet = []
|
|
51
59
|
listener = SheetListener.new(sheet)
|
|
52
60
|
parse(entry.file_data, listener)
|
|
53
|
-
context[:sheets] ||= []
|
|
54
61
|
context[:sheets] << [nth_sheet, sheet]
|
|
55
62
|
end
|
|
56
63
|
end
|
|
57
64
|
|
|
58
|
-
def
|
|
65
|
+
def finish_decompose(context, &block)
|
|
66
|
+
metadata = TextData.new("", source_data: context[:data])
|
|
67
|
+
context[:attributes].each do |name, value|
|
|
68
|
+
metadata[name] = value
|
|
69
|
+
end
|
|
70
|
+
yield(metadata)
|
|
71
|
+
|
|
59
72
|
shared_strings = context[:shared_strings]
|
|
60
73
|
sheets = context[:sheets].sort_by(&:first).collect(&:last)
|
|
61
|
-
|
|
74
|
+
sheet_names = context[:sheet_names]
|
|
75
|
+
sheets.each_with_index do |sheet, i|
|
|
62
76
|
sheet_text = ""
|
|
63
77
|
sheet.each do |row|
|
|
64
|
-
row_texts = row.collect do |
|
|
65
|
-
|
|
78
|
+
row_texts = row.collect do |cell|
|
|
79
|
+
case cell
|
|
80
|
+
when Integer
|
|
81
|
+
shared_strings[cell]
|
|
82
|
+
else
|
|
83
|
+
cell
|
|
84
|
+
end
|
|
66
85
|
end
|
|
67
86
|
sheet_text << row_texts.join("\t") << "\n"
|
|
68
87
|
end
|
|
69
|
-
sheet_text
|
|
88
|
+
text_data = TextData.new(sheet_text, source_data: context[:data])
|
|
89
|
+
text_data["index"] = i
|
|
90
|
+
name = sheet_names[i]
|
|
91
|
+
text_data["name"] = name if name
|
|
92
|
+
yield(text_data)
|
|
93
|
+
end
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
class WorkbookListener < SAXListener
|
|
97
|
+
URI = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
|
|
98
|
+
|
|
99
|
+
def initialize(sheet_names)
|
|
100
|
+
@sheet_names = sheet_names
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
def start_element(uri, local_name, qname, attributes)
|
|
104
|
+
return unless uri == URI
|
|
105
|
+
case local_name
|
|
106
|
+
when "sheet"
|
|
107
|
+
@sheet_names << attributes["name"]
|
|
108
|
+
end
|
|
70
109
|
end
|
|
71
|
-
sheet_texts.join("\n")
|
|
72
110
|
end
|
|
73
111
|
|
|
74
112
|
class SheetListener < SAXListener
|
|
@@ -76,6 +114,7 @@ module ChupaText
|
|
|
76
114
|
|
|
77
115
|
def initialize(sheet)
|
|
78
116
|
@sheet = sheet
|
|
117
|
+
@cell_type = nil
|
|
79
118
|
@in_v = false
|
|
80
119
|
end
|
|
81
120
|
|
|
@@ -84,13 +123,22 @@ module ChupaText
|
|
|
84
123
|
case local_name
|
|
85
124
|
when "row"
|
|
86
125
|
@sheet << []
|
|
126
|
+
when "c"
|
|
127
|
+
@cell_type = parse_cell_type(attributes["t"])
|
|
128
|
+
# when "is" # TODO
|
|
87
129
|
when "v"
|
|
88
130
|
@in_v = true
|
|
89
131
|
end
|
|
90
132
|
end
|
|
91
133
|
|
|
92
134
|
def end_element(uri, local_name, qname)
|
|
93
|
-
|
|
135
|
+
return unless uri == URI
|
|
136
|
+
case local_name
|
|
137
|
+
when "c"
|
|
138
|
+
@cell_type = nil
|
|
139
|
+
when "v"
|
|
140
|
+
@in_v = false
|
|
141
|
+
end
|
|
94
142
|
end
|
|
95
143
|
|
|
96
144
|
def characters(text)
|
|
@@ -102,9 +150,34 @@ module ChupaText
|
|
|
102
150
|
end
|
|
103
151
|
|
|
104
152
|
private
|
|
153
|
+
# https://c-rex.net/projects/samples/ooxml/e1/Part4/OOXML_P4_DOCX_ST_CellType_topic_ID0E6NEFB.html
|
|
154
|
+
def parse_cell_type(type)
|
|
155
|
+
case type
|
|
156
|
+
when "b"
|
|
157
|
+
:boolean
|
|
158
|
+
when "e"
|
|
159
|
+
:error
|
|
160
|
+
when "inlineStr"
|
|
161
|
+
:inline_string
|
|
162
|
+
when "n"
|
|
163
|
+
:number
|
|
164
|
+
when "s"
|
|
165
|
+
:shared_string
|
|
166
|
+
when "str"
|
|
167
|
+
:string
|
|
168
|
+
else
|
|
169
|
+
nil
|
|
170
|
+
end
|
|
171
|
+
end
|
|
172
|
+
|
|
105
173
|
def add_column(text)
|
|
106
174
|
return unless @in_v
|
|
107
|
-
@
|
|
175
|
+
case @cell_type
|
|
176
|
+
when :shared_string
|
|
177
|
+
@sheet.last << Integer(text, 10)
|
|
178
|
+
else
|
|
179
|
+
@sheet.last << text
|
|
180
|
+
end
|
|
108
181
|
end
|
|
109
182
|
end
|
|
110
183
|
end
|
|
@@ -34,11 +34,12 @@ module ChupaText
|
|
|
34
34
|
end
|
|
35
35
|
end
|
|
36
36
|
|
|
37
|
-
def decompose(data)
|
|
37
|
+
def decompose(data, &block)
|
|
38
38
|
context = {
|
|
39
|
-
|
|
39
|
+
data: data,
|
|
40
40
|
attributes: {},
|
|
41
41
|
}
|
|
42
|
+
start_decompose(context)
|
|
42
43
|
data.open do |input|
|
|
43
44
|
Archive::Zip.open(input) do |zip|
|
|
44
45
|
zip.each do |entry|
|
|
@@ -56,12 +57,7 @@ module ChupaText
|
|
|
56
57
|
end
|
|
57
58
|
end
|
|
58
59
|
end
|
|
59
|
-
|
|
60
|
-
text_data = TextData.new(text, source_data: data)
|
|
61
|
-
context[:attributes].each do |name, value|
|
|
62
|
-
text_data[name] = value
|
|
63
|
-
end
|
|
64
|
-
yield(text_data)
|
|
60
|
+
finish_decompose(context, &block)
|
|
65
61
|
end
|
|
66
62
|
|
|
67
63
|
private
|
|
@@ -60,6 +60,7 @@ module ChupaText
|
|
|
60
60
|
@prefix_to_uri = {}
|
|
61
61
|
@uri_to_prefix = {}
|
|
62
62
|
@in_p = false
|
|
63
|
+
@in_shapes = false
|
|
63
64
|
end
|
|
64
65
|
|
|
65
66
|
def start_prefix_mapping(prefix, uri)
|
|
@@ -86,29 +87,44 @@ module ChupaText
|
|
|
86
87
|
@sheets << {
|
|
87
88
|
name: attributes["#{table_prefix}:name"],
|
|
88
89
|
rows: [],
|
|
90
|
+
shape_texts: [],
|
|
89
91
|
}
|
|
90
92
|
when "table-row"
|
|
91
93
|
@sheets.last[:rows] << []
|
|
92
94
|
when "table-cell"
|
|
93
95
|
@sheets.last[:rows].last << {text: ""}
|
|
96
|
+
when "covered-table-cell"
|
|
97
|
+
@sheets.last[:rows].last << {text: ""}
|
|
98
|
+
when "shapes"
|
|
99
|
+
@in_shapes = true
|
|
94
100
|
end
|
|
95
101
|
end
|
|
96
102
|
end
|
|
97
103
|
|
|
98
104
|
def end_element(uri, local_name, qname)
|
|
99
|
-
@in_p = false
|
|
100
105
|
case uri
|
|
106
|
+
when TEXT_URI
|
|
107
|
+
case local_name
|
|
108
|
+
when "p"
|
|
109
|
+
@in_p = false
|
|
110
|
+
end
|
|
101
111
|
when TABLE_URI
|
|
102
112
|
case local_name
|
|
103
113
|
when "table"
|
|
104
114
|
sheet = @sheets.last
|
|
105
115
|
text = ""
|
|
116
|
+
shape_texts = sheet[:shape_texts]
|
|
117
|
+
unless shape_texts.empty?
|
|
118
|
+
text << shape_texts.join("\n") << "\n"
|
|
119
|
+
end
|
|
106
120
|
sheet[:rows].each do |row|
|
|
107
121
|
cell_texts = row.collect {|cell| cell[:text]}
|
|
108
122
|
next if cell_texts.all?(&:empty?)
|
|
109
123
|
text << cell_texts.join("\t") << "\n"
|
|
110
124
|
end
|
|
111
125
|
sheet[:text] = text
|
|
126
|
+
when "shapes"
|
|
127
|
+
@in_shapes = false
|
|
112
128
|
end
|
|
113
129
|
end
|
|
114
130
|
end
|
|
@@ -124,7 +140,15 @@ module ChupaText
|
|
|
124
140
|
private
|
|
125
141
|
def add_text(text)
|
|
126
142
|
return unless @in_p
|
|
127
|
-
@sheets.last
|
|
143
|
+
sheet = @sheets.last
|
|
144
|
+
if @in_shapes
|
|
145
|
+
sheet[:shape_texts] << text
|
|
146
|
+
else
|
|
147
|
+
sheet[:rows].last.last[:text] << text
|
|
148
|
+
end
|
|
149
|
+
rescue
|
|
150
|
+
pp [text, @sheets]
|
|
151
|
+
raise
|
|
128
152
|
end
|
|
129
153
|
end
|
|
130
154
|
end
|