simple_xlsx_reader 1.0.1 → 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.travis.yml +8 -0
- data/CHANGELOG.md +23 -0
- data/README.md +13 -6
- data/lib/simple_xlsx_reader/version.rb +1 -1
- data/lib/simple_xlsx_reader.rb +137 -46
- data/simple_xlsx_reader.gemspec +4 -2
- data/test/date1904_test.rb +1 -1
- data/test/datetime_test.rb +3 -2
- data/test/datetimes.xlsx +0 -0
- data/test/gdocs_sheet.xlsx +0 -0
- data/test/gdocs_sheet_test.rb +15 -0
- data/test/lower_case_sharedstrings.xlsx +0 -0
- data/test/lower_case_sharedstrings_test.rb +15 -0
- data/test/performance_test.rb +3 -3
- data/test/sesame_street_blog.xlsx +0 -0
- data/test/simple_xlsx_reader_test.rb +176 -25
- data/test/styles.xml +4 -2
- data/test/test_helper.rb +1 -0
- metadata +29 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: e2b04473235c5ed2c2764f62a627fa6f16816c36e0fcff3497be229f8666a0f7
|
4
|
+
data.tar.gz: 9367b0082f31e9cb208d9f97ed6cb67d5276a459562809460694602339dfdaad
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cd42f7a0b8830a2f01703dca10ae779b973566ad25e3b74d31dc3693977fa5b2b3442e47bc1a3b50723bae3bb9f31facd923f1eaba06b51cc8b927e7fb207cf3
|
7
|
+
data.tar.gz: 38ecb026b0ad5a1985d88349a839a9d2972f85596504e6f300686f9751169a3c8d62582e79119106085a9cadc066517206da117993c3a30f48a5a0c58f256b4c
|
data/.travis.yml
ADDED
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,26 @@
|
|
1
|
+
### 1.0.5
|
2
|
+
|
3
|
+
* Support string or io input via `SimpleXlsxReader#parse` (@kalsan, @til)
|
4
|
+
|
5
|
+
### 1.0.4
|
6
|
+
|
7
|
+
* Fix Windows + RubyZip 1.2.1 bug preventing files from being read
|
8
|
+
* Add ability to parse hyperlinks
|
9
|
+
* Support files exported from Google Docs (@Strnadj)
|
10
|
+
|
11
|
+
### 1.0.3
|
12
|
+
|
13
|
+
Broken on Ruby 1.9; yanked.
|
14
|
+
|
15
|
+
### 1.0.2
|
16
|
+
|
17
|
+
* Fix Ruby 1.9.3-specific bug preventing parsing most sheets [middagj, eritiro]
|
18
|
+
* Better support for non-excel-generated xlsx files [bwlang]
|
19
|
+
* You don't always have a numFmtId column, and that's OK
|
20
|
+
* Sometimes 'sharedStrings.xml' can be 'sharedstrings.xml'
|
21
|
+
* Fixed parsing times very close to 12/30/1899 [Valeriy Utyaganov]
|
22
|
+
* Be more flexible with custom formats using a numFmtId < 164
|
23
|
+
|
1
24
|
### 1.0.1
|
2
25
|
|
3
26
|
* Add support for the 1904 date system [zilverline]
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# SimpleXlsxReader
|
1
|
+
# SimpleXlsxReader [![Build Status](https://travis-ci.org/woahdae/simple_xlsx_reader.svg?branch=master)](https://travis-ci.org/woahdae/simple_xlsx_reader)
|
2
2
|
|
3
3
|
An xlsx reader for Ruby that parses xlsx cell values into plain ruby
|
4
4
|
primitives and dates/times.
|
@@ -35,14 +35,21 @@ Here's the totality of the public api, in code:
|
|
35
35
|
|
36
36
|
module SimpleXlsxReader
|
37
37
|
def self.open(file_path)
|
38
|
-
Document.new(file_path).tap(&:sheets)
|
38
|
+
Document.new(file_path: file_path).tap(&:sheets)
|
39
|
+
end
|
40
|
+
|
41
|
+
def self.parse(string_or_io)
|
42
|
+
Document.new(string_or_io: string_or_io).tap(&:sheets)
|
39
43
|
end
|
40
44
|
|
41
45
|
class Document
|
42
|
-
attr_reader :file_path
|
46
|
+
attr_reader :string_or_io
|
47
|
+
|
48
|
+
def initialize(legacy_file_path = nil, file_path: nil, string_or_io: nil)
|
49
|
+
((file_path || legacy_file_path).nil? ^ string_or_io.nil?) ||
|
50
|
+
fail(ArgumentError, 'either file_path or string_or_io must be provided')
|
43
51
|
|
44
|
-
|
45
|
-
@file_path = file_path
|
52
|
+
@string_or_io = string_or_io || File.new(file_path || legacy_file_path)
|
46
53
|
end
|
47
54
|
|
48
55
|
def sheets
|
@@ -54,7 +61,7 @@ Here's the totality of the public api, in code:
|
|
54
61
|
end
|
55
62
|
|
56
63
|
def xml
|
57
|
-
Xml.load(file_path)
|
64
|
+
Xml.load(string_or_io)
|
58
65
|
end
|
59
66
|
|
60
67
|
class Sheet < Struct.new(:name, :rows)
|
data/lib/simple_xlsx_reader.rb
CHANGED
@@ -19,6 +19,33 @@ end
|
|
19
19
|
module SimpleXlsxReader
|
20
20
|
class CellLoadError < StandardError; end
|
21
21
|
|
22
|
+
# We support hyperlinks as a "type" even though they're technically
|
23
|
+
# represented either as a function or an external reference in the xlsx spec.
|
24
|
+
#
|
25
|
+
# Since having hyperlink data in our sheet usually means we might want to do
|
26
|
+
# something primarily with the URL (store it in the database, download it, etc),
|
27
|
+
# we go through extra effort to parse the function or follow the reference
|
28
|
+
# to represent the hyperlink primarily as a URL. However, maybe we do want
|
29
|
+
# the hyperlink "friendly name" part (as MS calls it), so here we've subclassed
|
30
|
+
# string to tack on the friendly name. This means 80% of us that just want
|
31
|
+
# the URL value will have to do nothing extra, but the 20% that might want the
|
32
|
+
# friendly name can access it.
|
33
|
+
#
|
34
|
+
# Note, by default, the value we would get by just asking the cell would
|
35
|
+
# be the "friendly name" and *not* the URL, which is tucked away in the
|
36
|
+
# function definition or a separate "relationships" meta-document.
|
37
|
+
#
|
38
|
+
# See MS documentation on the HYPERLINK function for some background:
|
39
|
+
# https://support.office.com/en-us/article/HYPERLINK-function-333c7ce6-c5ae-4164-9c47-7de9b76f577f
|
40
|
+
class Hyperlink < String
|
41
|
+
attr_reader :friendly_name
|
42
|
+
|
43
|
+
def initialize(url, friendly_name = nil)
|
44
|
+
@friendly_name = friendly_name
|
45
|
+
super(url)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
22
49
|
def self.configuration
|
23
50
|
@configuration ||= Struct.new(:catch_cell_load_errors).new.tap do |c|
|
24
51
|
c.catch_cell_load_errors = false
|
@@ -26,14 +53,21 @@ module SimpleXlsxReader
|
|
26
53
|
end
|
27
54
|
|
28
55
|
def self.open(file_path)
|
29
|
-
Document.new(file_path).tap(&:sheets)
|
56
|
+
Document.new(file_path: file_path).tap(&:sheets)
|
57
|
+
end
|
58
|
+
|
59
|
+
def self.parse(string_or_io)
|
60
|
+
Document.new(string_or_io: string_or_io).tap(&:sheets)
|
30
61
|
end
|
31
62
|
|
32
63
|
class Document
|
33
|
-
attr_reader :file_path
|
64
|
+
attr_reader :string_or_io
|
65
|
+
|
66
|
+
def initialize(legacy_file_path = nil, file_path: nil, string_or_io: nil)
|
67
|
+
((file_path || legacy_file_path).nil? ^ string_or_io.nil?) ||
|
68
|
+
fail(ArgumentError, 'either file_path or string_or_io must be provided')
|
34
69
|
|
35
|
-
|
36
|
-
@file_path = file_path
|
70
|
+
@string_or_io = string_or_io || File.new(file_path || legacy_file_path)
|
37
71
|
end
|
38
72
|
|
39
73
|
def sheets
|
@@ -45,7 +79,7 @@ module SimpleXlsxReader
|
|
45
79
|
end
|
46
80
|
|
47
81
|
def xml
|
48
|
-
Xml.load(file_path)
|
82
|
+
Xml.load(string_or_io)
|
49
83
|
end
|
50
84
|
|
51
85
|
class Sheet < Struct.new(:name, :rows)
|
@@ -69,28 +103,54 @@ module SimpleXlsxReader
|
|
69
103
|
##
|
70
104
|
# For internal use; stores source xml in nokogiri documents
|
71
105
|
class Xml
|
72
|
-
attr_accessor :workbook, :shared_strings, :sheets, :styles
|
106
|
+
attr_accessor :workbook, :shared_strings, :sheets, :sheet_rels, :styles
|
73
107
|
|
74
|
-
def self.load(file_path)
|
108
|
+
def self.load(string_or_io)
|
75
109
|
self.new.tap do |xml|
|
76
|
-
SimpleXlsxReader::Zip.open(file_path) do |zip|
|
77
|
-
xml.workbook = Nokogiri::XML(zip.read('xl/workbook.xml')).remove_namespaces!
|
78
|
-
xml.styles = Nokogiri::XML(zip.read('xl/styles.xml')).remove_namespaces!
|
79
|
-
|
80
|
-
# optional feature used by excel, but not often used by xlsx
|
81
|
-
# generation libraries
|
82
|
-
if zip.file.file?('xl/sharedStrings.xml')
|
83
|
-
xml.shared_strings = Nokogiri::XML(zip.read('xl/sharedStrings.xml')).remove_namespaces!
|
84
|
-
end
|
85
|
-
|
110
|
+
SimpleXlsxReader::Zip.open_buffer(string_or_io) do |zip|
|
86
111
|
xml.sheets = []
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
112
|
+
xml.sheet_rels = []
|
113
|
+
|
114
|
+
# This weird style of enumerating over the entries lets us
|
115
|
+
# concisely assign entries in a case insensitive and
|
116
|
+
# slash insensitive ('/' vs '\') manner.
|
117
|
+
#
|
118
|
+
# RubyZip used to normalize the slashes, but doesn't now:
|
119
|
+
# https://github.com/rubyzip/rubyzip/issues/324
|
120
|
+
zip.entries.each do |entry|
|
121
|
+
if entry.name.match(/^xl.workbook\.xml$/) # xl/workbook.xml
|
122
|
+
xml.workbook = Nokogiri::XML(zip.read(entry)).remove_namespaces!
|
123
|
+
elsif entry.name.match(/^xl.styles\.xml$/) # xl/styles.xml
|
124
|
+
xml.styles = Nokogiri::XML(zip.read(entry)).remove_namespaces!
|
125
|
+
elsif entry.name.match(/^xl.sharedStrings\.xml$/i) # xl/sharedStrings.xml
|
126
|
+
# optional feature used by excel, but not often used by xlsx
|
127
|
+
# generation libraries. Path name is sometimes lowercase, too.
|
128
|
+
xml.shared_strings = Nokogiri::XML(zip.read(entry)).remove_namespaces!
|
129
|
+
elsif match = entry.name.match(/^xl.worksheets.sheet([0-9]*)\.xml$/)
|
130
|
+
sheet_number = match.captures.first.to_i
|
131
|
+
xml.sheets[sheet_number] =
|
132
|
+
Nokogiri::XML(zip.read(entry)).remove_namespaces!
|
133
|
+
elsif match = entry.name.match(/^xl.worksheets._rels.sheet([0-9]*)\.xml\.rels$/)
|
134
|
+
sheet_number = match.captures.first.to_i
|
135
|
+
xml.sheet_rels[sheet_number] =
|
136
|
+
Nokogiri::XML(zip.read(entry)).remove_namespaces!
|
137
|
+
end
|
138
|
+
end
|
91
139
|
|
92
|
-
|
93
|
-
|
140
|
+
# Sometimes there's a zero-index sheet.xml, ex.
|
141
|
+
# Google Docs creates:
|
142
|
+
#
|
143
|
+
# xl/worksheets/sheet.xml
|
144
|
+
# xl/worksheets/sheet1.xml
|
145
|
+
# xl/worksheets/sheet2.xml
|
146
|
+
# While Excel creates:
|
147
|
+
# xl/worksheets/sheet1.xml
|
148
|
+
# xl/worksheets/sheet2.xml
|
149
|
+
#
|
150
|
+
# So, for the latter case, let's shift [null, <Sheet 1>, <Sheet 2>]
|
151
|
+
if !xml.sheets[0]
|
152
|
+
xml.sheets.shift
|
153
|
+
xml.sheet_rels.shift
|
94
154
|
end
|
95
155
|
end
|
96
156
|
end
|
@@ -105,7 +165,7 @@ module SimpleXlsxReader
|
|
105
165
|
|
106
166
|
def load_sheets
|
107
167
|
sheet_toc.each_with_index.map do |(sheet_name, _sheet_number), i|
|
108
|
-
parse_sheet(sheet_name, xml.sheets[i]) # sheet_number is *not* the index into xml.sheets
|
168
|
+
parse_sheet(sheet_name, xml.sheets[i], xml.sheet_rels[i]) # sheet_number is *not* the index into xml.sheets
|
109
169
|
end
|
110
170
|
end
|
111
171
|
|
@@ -121,9 +181,10 @@ module SimpleXlsxReader
|
|
121
181
|
end
|
122
182
|
end
|
123
183
|
|
124
|
-
def parse_sheet(sheet_name, xsheet)
|
184
|
+
def parse_sheet(sheet_name, xsheet, xrels)
|
125
185
|
sheet = Sheet.new(sheet_name)
|
126
186
|
sheet_width, sheet_height = *sheet_dimensions(xsheet)
|
187
|
+
cells_w_links = xsheet.xpath('//hyperlinks/hyperlink').inject({}) {|acc, e| acc[e.attr(:ref)] = e.attr(:id); acc}
|
127
188
|
|
128
189
|
sheet.rows = Array.new(sheet_height) { Array.new(sheet_width) }
|
129
190
|
xsheet.xpath("/worksheet/sheetData/row/c").each do |xcell|
|
@@ -148,10 +209,21 @@ module SimpleXlsxReader
|
|
148
209
|
# by about 60%. Odd.
|
149
210
|
xvalue = type == 'inlineStr' ?
|
150
211
|
(xis = xcell.children.find {|c| c.name == 'is'}) && xis.children.find {|c| c.name == 't'} :
|
151
|
-
xcell.children.find {|c| c.name == 'v'}
|
212
|
+
xcell.children.find {|c| c.name == 'f' && c.text.start_with?('HYPERLINK(') || c.name == 'v'}
|
213
|
+
|
214
|
+
if xvalue
|
215
|
+
value = xvalue.text.strip
|
216
|
+
|
217
|
+
if rel_id = cells_w_links[xcell.attr('r')] # a hyperlink made via GUI
|
218
|
+
url = xrels.at_xpath(%(//*[@Id="#{rel_id}"])).attr('Target')
|
219
|
+
elsif xvalue.name == 'f' # only time we have a function is if it's a hyperlink
|
220
|
+
url = value.slice(/HYPERLINK\("(.*?)"/, 1)
|
221
|
+
end
|
222
|
+
end
|
152
223
|
|
153
224
|
cell = begin
|
154
|
-
self.class.cast(
|
225
|
+
self.class.cast(value, type, style,
|
226
|
+
:url => url,
|
155
227
|
:shared_strings => shared_strings,
|
156
228
|
:base_date => base_date)
|
157
229
|
rescue => e
|
@@ -218,11 +290,13 @@ module SimpleXlsxReader
|
|
218
290
|
# 'ABA' = 26 * 26 * 1 + 26 * 2 + 1
|
219
291
|
# 'BZA' = 26 * 26 * 2 + 26 * 26 + 1
|
220
292
|
def column_letter_to_number(column_letter)
|
221
|
-
pow = -1
|
222
|
-
|
223
|
-
|
224
|
-
|
293
|
+
pow = column_letter.length - 1
|
294
|
+
result = 0
|
295
|
+
column_letter.each_byte do |b|
|
296
|
+
result += 26**pow * (b - 64)
|
297
|
+
pow -= 1
|
225
298
|
end
|
299
|
+
result
|
226
300
|
end
|
227
301
|
|
228
302
|
# Excel doesn't record types for some cells, only its display style, so
|
@@ -241,21 +315,32 @@ module SimpleXlsxReader
|
|
241
315
|
# type.
|
242
316
|
def style_types
|
243
317
|
@style_types ||=
|
244
|
-
|
245
|
-
|
318
|
+
xml.styles.xpath('/styleSheet/cellXfs/xf').map {|xstyle|
|
319
|
+
style_type_by_num_fmt_id(num_fmt_id(xstyle))}
|
320
|
+
end
|
321
|
+
|
322
|
+
#returns the numFmtId value if it's available
|
323
|
+
def num_fmt_id(xstyle)
|
324
|
+
if xstyle.attributes['numFmtId']
|
325
|
+
xstyle.attributes['numFmtId'].value
|
326
|
+
else
|
327
|
+
nil
|
328
|
+
end
|
246
329
|
end
|
247
330
|
|
248
331
|
# Finds the type we think a style is; For example, fmtId 14 is a date
|
249
|
-
# style, so this would return :date
|
332
|
+
# style, so this would return :date.
|
333
|
+
#
|
334
|
+
# Note, custom styles usually (are supposed to?) have a numFmtId >= 164,
|
335
|
+
# but in practice can sometimes be simply out of the usual "Any Language"
|
336
|
+
# id range that goes up to 49. For example, I have seen a numFmtId of
|
337
|
+
# 59 specified as a date. In Thai, 59 is a number format, so this seems
|
338
|
+
# like a bad idea, but we try to be flexible and just go with it.
|
250
339
|
def style_type_by_num_fmt_id(id)
|
251
340
|
return nil if id.nil?
|
252
341
|
|
253
342
|
id = id.to_i
|
254
|
-
|
255
|
-
custom_style_types[id]
|
256
|
-
else # we should know this one
|
257
|
-
NumFmtMap[id]
|
258
|
-
end
|
343
|
+
NumFmtMap[id] || custom_style_types[id]
|
259
344
|
end
|
260
345
|
|
261
346
|
# Map of (numFmtId >= 164) (custom styles) to our best guess at the type
|
@@ -314,7 +399,7 @@ module SimpleXlsxReader
|
|
314
399
|
type = style
|
315
400
|
end
|
316
401
|
|
317
|
-
case type
|
402
|
+
casted = case type
|
318
403
|
|
319
404
|
##
|
320
405
|
# There are few built-in types
|
@@ -347,15 +432,15 @@ module SimpleXlsxReader
|
|
347
432
|
# the trickiest. note that all these formats can vary on
|
348
433
|
# whether they actually contain a date, time, or datetime.
|
349
434
|
when :date, :time, :date_time
|
350
|
-
|
435
|
+
value = Float(value)
|
436
|
+
days_since_date_system_start = value.to_i
|
437
|
+
fraction_of_24 = value - days_since_date_system_start
|
351
438
|
|
352
439
|
# http://stackoverflow.com/questions/10559767/how-to-convert-ms-excel-date-from-float-to-date-format-in-ruby
|
353
|
-
date = options.fetch(:base_date, DATE_SYSTEM_1900) +
|
354
|
-
|
355
|
-
if fraction_of_24 # there is a time associated
|
356
|
-
fraction_of_24 = "0.#{fraction_of_24}".to_f
|
357
|
-
seconds = (fraction_of_24 * 86400).round
|
440
|
+
date = options.fetch(:base_date, DATE_SYSTEM_1900) + days_since_date_system_start
|
358
441
|
|
442
|
+
if fraction_of_24 > 0 # there is a time associated
|
443
|
+
seconds = (fraction_of_24 * 86400).round
|
359
444
|
return Time.utc(date.year, date.month, date.day) + seconds
|
360
445
|
else
|
361
446
|
return date
|
@@ -374,6 +459,12 @@ module SimpleXlsxReader
|
|
374
459
|
else
|
375
460
|
value
|
376
461
|
end
|
462
|
+
|
463
|
+
if options[:url]
|
464
|
+
Hyperlink.new(options[:url], casted)
|
465
|
+
else
|
466
|
+
casted
|
467
|
+
end
|
377
468
|
end
|
378
469
|
|
379
470
|
## Returns the base_date from which to calculate dates.
|
data/simple_xlsx_reader.gemspec
CHANGED
@@ -7,19 +7,21 @@ Gem::Specification.new do |gem|
|
|
7
7
|
gem.name = "simple_xlsx_reader"
|
8
8
|
gem.version = SimpleXlsxReader::VERSION
|
9
9
|
gem.authors = ["Woody Peterson"]
|
10
|
-
gem.email = ["woody@
|
10
|
+
gem.email = ["woody.peterson@gmail.com"]
|
11
11
|
gem.description = %q{Read xlsx data the Ruby way}
|
12
12
|
gem.summary = %q{Read xlsx data the Ruby way}
|
13
13
|
gem.homepage = ""
|
14
|
+
gem.license = "MIT"
|
14
15
|
|
15
16
|
gem.add_dependency 'nokogiri'
|
16
17
|
gem.add_dependency 'rubyzip'
|
17
18
|
|
18
19
|
gem.add_development_dependency 'minitest', '>= 5.0'
|
20
|
+
gem.add_development_dependency 'rake'
|
19
21
|
gem.add_development_dependency 'pry'
|
20
22
|
|
21
23
|
gem.files = `git ls-files`.split($/)
|
22
24
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
23
|
-
gem.test_files = gem.files.grep(%r{^
|
25
|
+
gem.test_files = gem.files.grep(%r{^test/})
|
24
26
|
gem.require_paths = ["lib"]
|
25
27
|
end
|
data/test/date1904_test.rb
CHANGED
data/test/datetime_test.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
require_relative 'test_helper'
|
2
2
|
|
3
3
|
describe SimpleXlsxReader do
|
4
4
|
let(:datetimes_file) { File.join(File.dirname(__FILE__),
|
@@ -11,7 +11,8 @@ describe SimpleXlsxReader do
|
|
11
11
|
"Datetimes" =>
|
12
12
|
[[Time.parse("2013-08-19 18:29:59 UTC")],
|
13
13
|
[Time.parse("2013-08-19 18:30:00 UTC")],
|
14
|
-
[Time.parse("2013-08-19 18:30:01 UTC")]
|
14
|
+
[Time.parse("2013-08-19 18:30:01 UTC")],
|
15
|
+
[Time.parse("1899-12-30 00:30:00 UTC")]]
|
15
16
|
})
|
16
17
|
end
|
17
18
|
|
data/test/datetimes.xlsx
CHANGED
Binary file
|
Binary file
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require_relative 'test_helper'
|
2
|
+
require 'time'
|
3
|
+
|
4
|
+
describe SimpleXlsxReader do
|
5
|
+
let(:one_sheet_file) { File.join(File.dirname(__FILE__), 'gdocs_sheet.xlsx') }
|
6
|
+
let(:subject) { SimpleXlsxReader::Document.new(one_sheet_file) }
|
7
|
+
|
8
|
+
it 'able to load file from google docs' do
|
9
|
+
subject.to_hash.must_equal({
|
10
|
+
"List 1" => [["Empty gdocs list 1"]],
|
11
|
+
"List 2" => [["Empty gdocs list 2"]]
|
12
|
+
})
|
13
|
+
end
|
14
|
+
|
15
|
+
end
|
Binary file
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require_relative 'test_helper'
|
2
|
+
|
3
|
+
describe SimpleXlsxReader do
|
4
|
+
let(:lower_case_shared_strings) { File.join(File.dirname(__FILE__),
|
5
|
+
'lower_case_sharedstrings.xlsx') }
|
6
|
+
|
7
|
+
let(:subject) { SimpleXlsxReader::Document.new(lower_case_shared_strings) }
|
8
|
+
|
9
|
+
|
10
|
+
describe '#to_hash' do
|
11
|
+
it 'should have the word Well in the first row' do
|
12
|
+
subject.sheets.first.rows[0].must_include('Well')
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
data/test/performance_test.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
require_relative 'test_helper'
|
2
2
|
require 'minitest/benchmark'
|
3
3
|
|
4
4
|
describe 'SimpleXlsxReader Benchmark' do
|
@@ -96,13 +96,13 @@ describe 'SimpleXlsxReader Benchmark' do
|
|
96
96
|
bench_exp(1,10000)
|
97
97
|
end
|
98
98
|
|
99
|
-
bench_performance_linear 'parses sheets in linear time', 0.
|
99
|
+
bench_performance_linear 'parses sheets in linear time', 0.999 do |n|
|
100
100
|
|
101
101
|
raise "not enough sample data; asked for #{n}, only have #{@xml.sheets.size}"\
|
102
102
|
if @xml.sheets[n].nil?
|
103
103
|
|
104
104
|
sheet = SimpleXlsxReader::Document::Mapper.new(@xml).
|
105
|
-
parse_sheet('test', @xml.sheets[n])
|
105
|
+
parse_sheet('test', @xml.sheets[n], nil)
|
106
106
|
|
107
107
|
raise "sheet didn't parse correctly; expected #{n + 1} rows, got #{sheet.rows.size}"\
|
108
108
|
if sheet.rows.size != n + 1
|
Binary file
|
@@ -1,26 +1,66 @@
|
|
1
|
-
|
1
|
+
require_relative 'test_helper'
|
2
2
|
require 'time'
|
3
3
|
|
4
|
+
SXR = SimpleXlsxReader
|
5
|
+
|
4
6
|
describe SimpleXlsxReader do
|
5
|
-
let(:
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
"
|
14
|
-
[
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
}
|
7
|
+
let(:sesame_street_blog_file_path) { File.join(File.dirname(__FILE__), 'sesame_street_blog.xlsx') }
|
8
|
+
let(:sesame_street_blog_io) { File.new(sesame_street_blog_file_path) }
|
9
|
+
let(:expected_result) do
|
10
|
+
{
|
11
|
+
"Authors"=>
|
12
|
+
[["Name", "Occupation"],
|
13
|
+
["Big Bird", "Teacher"]],
|
14
|
+
"Posts"=>
|
15
|
+
[["Author Name", "Title", "Body", "Created At", "Comment Count", "URL"],
|
16
|
+
["Big Bird", "The Number 1", "The Greatest", Time.parse("2002-01-01 11:00:00 UTC"), 1, SXR::Hyperlink.new("http://www.example.com/hyperlink-function", "This uses the HYPERLINK() function")],
|
17
|
+
["Big Bird", "The Number 2", "Second Best", Time.parse("2002-01-02 14:00:00 UTC"), 2, SXR::Hyperlink.new("http://www.example.com/hyperlink-gui", "This uses the hyperlink GUI option")],
|
18
|
+
["Big Bird", "Formula Dates", "Tricky tricky", Time.parse("2002-01-03 14:00:00 UTC"), 0, nil],
|
19
|
+
["Empty Eagress", nil, "The title, date, and comment have types, but no values", nil, nil, nil]]
|
20
|
+
}
|
21
|
+
end
|
22
|
+
|
23
|
+
describe SimpleXlsxReader do
|
24
|
+
describe 'load from file path' do
|
25
|
+
let(:subject) { SimpleXlsxReader.open(sesame_street_blog_file_path) }
|
26
|
+
|
27
|
+
it 'reads an xlsx file into a hash of {[sheet name] => [data]}' do
|
28
|
+
subject.to_hash.must_equal(expected_result)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
describe 'load from buffer' do
|
33
|
+
let(:subject) { SimpleXlsxReader.parse(sesame_street_blog_io) }
|
34
|
+
|
35
|
+
it 'reads an xlsx buffer into a hash of {[sheet name] => [data]}' do
|
36
|
+
subject.to_hash.must_equal(expected_result)
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
describe SimpleXlsxReader::Document do
|
42
|
+
describe 'load from file path' do
|
43
|
+
let(:subject) { SimpleXlsxReader::Document.new(file_path: sesame_street_blog_file_path) }
|
44
|
+
|
45
|
+
it 'reads an xlsx file into a hash of {[sheet name] => [data]}' do
|
46
|
+
subject.to_hash.must_equal(expected_result)
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
describe 'load from buffer' do
|
51
|
+
let(:subject) { SimpleXlsxReader::Document.new(string_or_io: sesame_street_blog_io) }
|
52
|
+
|
53
|
+
it 'reads an xlsx buffer into a hash of {[sheet name] => [data]}' do
|
54
|
+
subject.to_hash.must_equal(expected_result)
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
describe 'load from file path (legacy API)' do
|
59
|
+
let(:subject) { SimpleXlsxReader::Document.new(sesame_street_blog_file_path) }
|
60
|
+
|
61
|
+
it 'reads an xlsx file into a hash of {[sheet name] => [data]}' do
|
62
|
+
subject.to_hash.must_equal(expected_result)
|
63
|
+
end
|
24
64
|
end
|
25
65
|
end
|
26
66
|
|
@@ -63,10 +103,33 @@ describe SimpleXlsxReader do
|
|
63
103
|
must_equal Time.parse('2013-08-19 18:30 UTC')
|
64
104
|
end
|
65
105
|
|
106
|
+
it 'reads less-than-zero complex number types styled as times' do
|
107
|
+
described_class.cast('6.25E-2', 'n', :time).
|
108
|
+
must_equal Time.parse('1899-12-30 01:30:00 UTC')
|
109
|
+
end
|
110
|
+
|
66
111
|
it 'reads number types styled as date_times' do
|
67
112
|
described_class.cast('41505.77083', 'n', :date_time).
|
68
113
|
must_equal Time.parse('2013-08-19 18:30 UTC')
|
69
114
|
end
|
115
|
+
|
116
|
+
it 'raises when date-styled values are not numerical' do
|
117
|
+
lambda { described_class.cast('14 is not a valid date', nil, :date) }.
|
118
|
+
must_raise(ArgumentError)
|
119
|
+
end
|
120
|
+
|
121
|
+
describe "with the url option" do
|
122
|
+
let(:url) { "http://www.example.com/hyperlink" }
|
123
|
+
it 'creates a hyperlink with a string type' do
|
124
|
+
described_class.cast("A link", 'str', :string, url: url).
|
125
|
+
must_equal SXR::Hyperlink.new(url, "A link")
|
126
|
+
end
|
127
|
+
|
128
|
+
it 'creates a hyperlink with a shared string type' do
|
129
|
+
described_class.cast("2", 's', nil, shared_strings: ['a','b','c'], url: url).
|
130
|
+
must_equal SXR::Hyperlink.new(url, 'c')
|
131
|
+
end
|
132
|
+
end
|
70
133
|
end
|
71
134
|
|
72
135
|
describe '#shared_strings' do
|
@@ -102,6 +165,13 @@ describe SimpleXlsxReader do
|
|
102
165
|
|
103
166
|
it 'reads custom formatted styles (numFmtId >= 164)' do
|
104
167
|
mapper.style_types[1].must_equal :date_time
|
168
|
+
mapper.custom_style_types[164].must_equal :date_time
|
169
|
+
end
|
170
|
+
|
171
|
+
# something I've seen in the wild; don't think it's correct, but let's be flexible.
|
172
|
+
it 'reads custom formatted styles given an id < 164, but not explicitly defined in the SpreadsheetML spec' do
|
173
|
+
mapper.style_types[2].must_equal :date_time
|
174
|
+
mapper.custom_style_types[59].must_equal :date_time
|
105
175
|
end
|
106
176
|
end
|
107
177
|
|
@@ -246,16 +316,55 @@ describe SimpleXlsxReader do
|
|
246
316
|
it 'raises if configuration.catch_cell_load_errors' do
|
247
317
|
SimpleXlsxReader.configuration.catch_cell_load_errors = false
|
248
318
|
|
249
|
-
lambda { described_class.new(xml).parse_sheet('test', xml.sheets.first) }.
|
319
|
+
lambda { described_class.new(xml).parse_sheet('test', xml.sheets.first, nil) }.
|
250
320
|
must_raise(SimpleXlsxReader::CellLoadError)
|
251
321
|
end
|
252
322
|
|
253
323
|
it 'records a load error if not configuration.catch_cell_load_errors' do
|
254
324
|
SimpleXlsxReader.configuration.catch_cell_load_errors = true
|
255
325
|
|
256
|
-
sheet = described_class.new(xml).parse_sheet('test', xml.sheets.first)
|
257
|
-
sheet.load_errors[[0,0]].must_include 'invalid value for
|
326
|
+
sheet = described_class.new(xml).parse_sheet('test', xml.sheets.first, nil)
|
327
|
+
sheet.load_errors[[0,0]].must_include 'invalid value for Float'
|
328
|
+
end
|
329
|
+
end
|
330
|
+
|
331
|
+
describe "missing numFmtId attributes" do
|
332
|
+
|
333
|
+
let(:xml) do
|
334
|
+
SimpleXlsxReader::Document::Xml.new.tap do |xml|
|
335
|
+
xml.sheets = [Nokogiri::XML(
|
336
|
+
<<-XML
|
337
|
+
<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
|
338
|
+
<dimension ref="A1:A1" />
|
339
|
+
<sheetData>
|
340
|
+
<row>
|
341
|
+
<c r='A1' s='s'>
|
342
|
+
<v>some content</v>
|
343
|
+
</c>
|
344
|
+
</row>
|
345
|
+
</sheetData>
|
346
|
+
</worksheet>
|
347
|
+
XML
|
348
|
+
).remove_namespaces!]
|
349
|
+
|
350
|
+
xml.styles = Nokogiri::XML(
|
351
|
+
<<-XML
|
352
|
+
<styleSheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
|
353
|
+
|
354
|
+
</styleSheet>
|
355
|
+
XML
|
356
|
+
).remove_namespaces!
|
357
|
+
end
|
358
|
+
end
|
359
|
+
|
360
|
+
before do
|
361
|
+
@row = described_class.new(xml).parse_sheet('test', xml.sheets.first, nil).rows[0]
|
258
362
|
end
|
363
|
+
|
364
|
+
it 'continues even when cells are missing numFmtId attributes ' do
|
365
|
+
@row[0].must_equal 'some content'
|
366
|
+
end
|
367
|
+
|
259
368
|
end
|
260
369
|
|
261
370
|
describe 'parsing types' do
|
@@ -284,8 +393,21 @@ describe SimpleXlsxReader do
|
|
284
393
|
<c r='G1' t='inlineStr' s='0'>
|
285
394
|
<is><t>Cell G1</t></is>
|
286
395
|
</c>
|
396
|
+
|
397
|
+
<c r='H1' s='0'>
|
398
|
+
<f>HYPERLINK("http://www.example.com/hyperlink-function", "HYPERLINK function")</f>
|
399
|
+
<v>HYPERLINK function</v>
|
400
|
+
</c>
|
401
|
+
|
402
|
+
<c r='I1' s='0'>
|
403
|
+
<v>GUI-made hyperlink</v>
|
404
|
+
</c>
|
287
405
|
</row>
|
288
406
|
</sheetData>
|
407
|
+
|
408
|
+
<hyperlinks>
|
409
|
+
<hyperlink ref="I1" id="rId1"/>
|
410
|
+
</hyperlinks>
|
289
411
|
</worksheet>
|
290
412
|
XML
|
291
413
|
).remove_namespaces!]
|
@@ -303,11 +425,28 @@ describe SimpleXlsxReader do
|
|
303
425
|
</styleSheet>
|
304
426
|
XML
|
305
427
|
).remove_namespaces!
|
428
|
+
|
429
|
+
# Although not a "type" or "style" according to xlsx spec,
|
430
|
+
# it sure could/should be, so let's test it with the rest of our
|
431
|
+
# typecasting code.
|
432
|
+
xml.sheet_rels = [Nokogiri::XML(
|
433
|
+
<<-XML
|
434
|
+
<Relationships>
|
435
|
+
<Relationship
|
436
|
+
Id="rId1"
|
437
|
+
Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"
|
438
|
+
Target="http://www.example.com/hyperlink-gui"
|
439
|
+
TargetMode="External"
|
440
|
+
/>
|
441
|
+
</Relationships>
|
442
|
+
XML
|
443
|
+
).remove_namespaces!]
|
444
|
+
|
306
445
|
end
|
307
446
|
end
|
308
447
|
|
309
448
|
before do
|
310
|
-
@row = described_class.new(xml).parse_sheet('test', xml.sheets.first).rows[0]
|
449
|
+
@row = described_class.new(xml).parse_sheet('test', xml.sheets.first, xml.sheet_rels.first).rows[0]
|
311
450
|
end
|
312
451
|
|
313
452
|
it "reads 'Generic' cells as strings" do
|
@@ -341,6 +480,18 @@ describe SimpleXlsxReader do
|
|
341
480
|
it "reads strings formatted as inlineStr" do
|
342
481
|
@row[6].must_equal 'Cell G1'
|
343
482
|
end
|
483
|
+
|
484
|
+
it "reads hyperlinks created via HYPERLINK()" do
|
485
|
+
@row[7].must_equal(
|
486
|
+
SXR::Hyperlink.new(
|
487
|
+
"http://www.example.com/hyperlink-function", "HYPERLINK function"))
|
488
|
+
end
|
489
|
+
|
490
|
+
it "reads hyperlinks created via the GUI" do
|
491
|
+
@row[8].must_equal(
|
492
|
+
SXR::Hyperlink.new(
|
493
|
+
"http://www.example.com/hyperlink-gui", "GUI-made hyperlink"))
|
494
|
+
end
|
344
495
|
end
|
345
496
|
|
346
497
|
describe 'parsing documents with blank rows' do
|
@@ -389,7 +540,7 @@ describe SimpleXlsxReader do
|
|
389
540
|
end
|
390
541
|
|
391
542
|
before do
|
392
|
-
@rows = described_class.new(xml).parse_sheet('test', xml.sheets.first).rows
|
543
|
+
@rows = described_class.new(xml).parse_sheet('test', xml.sheets.first, nil).rows
|
393
544
|
end
|
394
545
|
|
395
546
|
it "reads row data despite gaps in row numbering" do
|
data/test/styles.xml
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
2
2
|
<styleSheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:x14ac="http://schemas.microsoft.com/office/spreadsheetml/2009/9/ac" mc:Ignorable="x14ac">
|
3
|
-
<numFmts count="1">
|
3
|
+
<numFmts count="2">
|
4
|
+
<numFmt numFmtId="59" formatCode="dd/mm/yyyy"/>
|
4
5
|
<numFmt numFmtId="164" formatCode="[$-409]m/d/yy\ h:mm\ AM/PM;@"/>
|
5
6
|
</numFmts>
|
6
7
|
<fonts count="3" x14ac:knownFonts="1">
|
@@ -50,9 +51,10 @@
|
|
50
51
|
<xf numFmtId="0" fontId="1" fillId="0" borderId="0" applyNumberFormat="0" applyFill="0" applyBorder="0" applyAlignment="0" applyProtection="0"/>
|
51
52
|
<xf numFmtId="0" fontId="2" fillId="0" borderId="0" applyNumberFormat="0" applyFill="0" applyBorder="0" applyAlignment="0" applyProtection="0"/>
|
52
53
|
</cellStyleXfs>
|
53
|
-
<cellXfs count="3">
|
54
|
+
<cellXfs count="4">
|
54
55
|
<xf numFmtId="0" fontId="0" fillId="0" borderId="0" xfId="0"/>
|
55
56
|
<xf numFmtId="164" fontId="0" fillId="0" borderId="0" xfId="0" applyNumberFormat="1"/>
|
57
|
+
<xf numFmtId="59" fontId="0" fillId="0" borderId="0" xfId="0" applyNumberFormat="1"/>
|
56
58
|
<xf numFmtId="1" fontId="0" fillId="0" borderId="0" xfId="0" applyNumberFormat="1"/>
|
57
59
|
</cellXfs>
|
58
60
|
<cellStyles count="3">
|
data/test/test_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simple_xlsx_reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Woody Peterson
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-05-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '5.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: pry
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -68,12 +82,13 @@ dependencies:
|
|
68
82
|
version: '0'
|
69
83
|
description: Read xlsx data the Ruby way
|
70
84
|
email:
|
71
|
-
- woody@
|
85
|
+
- woody.peterson@gmail.com
|
72
86
|
executables: []
|
73
87
|
extensions: []
|
74
88
|
extra_rdoc_files: []
|
75
89
|
files:
|
76
90
|
- ".gitignore"
|
91
|
+
- ".travis.yml"
|
77
92
|
- CHANGELOG.md
|
78
93
|
- Gemfile
|
79
94
|
- LICENSE.txt
|
@@ -86,6 +101,10 @@ files:
|
|
86
101
|
- test/date1904_test.rb
|
87
102
|
- test/datetime_test.rb
|
88
103
|
- test/datetimes.xlsx
|
104
|
+
- test/gdocs_sheet.xlsx
|
105
|
+
- test/gdocs_sheet_test.rb
|
106
|
+
- test/lower_case_sharedstrings.xlsx
|
107
|
+
- test/lower_case_sharedstrings_test.rb
|
89
108
|
- test/performance_test.rb
|
90
109
|
- test/sesame_street_blog.xlsx
|
91
110
|
- test/shared_strings.xml
|
@@ -93,7 +112,8 @@ files:
|
|
93
112
|
- test/styles.xml
|
94
113
|
- test/test_helper.rb
|
95
114
|
homepage: ''
|
96
|
-
licenses:
|
115
|
+
licenses:
|
116
|
+
- MIT
|
97
117
|
metadata: {}
|
98
118
|
post_install_message:
|
99
119
|
rdoc_options: []
|
@@ -110,8 +130,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
110
130
|
- !ruby/object:Gem::Version
|
111
131
|
version: '0'
|
112
132
|
requirements: []
|
113
|
-
|
114
|
-
rubygems_version: 2.2.0
|
133
|
+
rubygems_version: 3.1.6
|
115
134
|
signing_key:
|
116
135
|
specification_version: 4
|
117
136
|
summary: Read xlsx data the Ruby way
|
@@ -120,6 +139,10 @@ test_files:
|
|
120
139
|
- test/date1904_test.rb
|
121
140
|
- test/datetime_test.rb
|
122
141
|
- test/datetimes.xlsx
|
142
|
+
- test/gdocs_sheet.xlsx
|
143
|
+
- test/gdocs_sheet_test.rb
|
144
|
+
- test/lower_case_sharedstrings.xlsx
|
145
|
+
- test/lower_case_sharedstrings_test.rb
|
123
146
|
- test/performance_test.rb
|
124
147
|
- test/sesame_street_blog.xlsx
|
125
148
|
- test/shared_strings.xml
|