simple_xlsx_reader 1.0.5 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +38 -0
- data/CHANGELOG.md +7 -0
- data/README.md +190 -64
- data/Rakefile +3 -1
- data/lib/simple_xlsx_reader/document.rb +147 -0
- data/lib/simple_xlsx_reader/hyperlink.rb +30 -0
- data/lib/simple_xlsx_reader/loader/shared_strings_parser.rb +46 -0
- data/lib/simple_xlsx_reader/loader/sheet_parser.rb +256 -0
- data/lib/simple_xlsx_reader/loader/style_types_parser.rb +115 -0
- data/lib/simple_xlsx_reader/loader/workbook_parser.rb +39 -0
- data/lib/simple_xlsx_reader/loader.rb +199 -0
- data/lib/simple_xlsx_reader/version.rb +3 -1
- data/lib/simple_xlsx_reader.rb +23 -519
- data/test/date1904_test.rb +5 -4
- data/test/datetime_test.rb +17 -10
- data/test/gdocs_sheet_test.rb +6 -5
- data/test/lower_case_sharedstrings_test.rb +9 -4
- data/test/performance_test.rb +85 -88
- data/test/shared_strings.xml +4 -0
- data/test/simple_xlsx_reader_test.rb +785 -375
- data/test/test_helper.rb +4 -1
- data/test/test_xlsx_builder.rb +104 -0
- metadata +16 -6
data/lib/simple_xlsx_reader.rb
CHANGED
@@ -1,7 +1,18 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
require 'nokogiri'
|
3
4
|
require 'date'
|
4
5
|
|
6
|
+
require 'simple_xlsx_reader/version'
|
7
|
+
require 'simple_xlsx_reader/hyperlink'
|
8
|
+
require 'simple_xlsx_reader/document'
|
9
|
+
require 'simple_xlsx_reader/loader'
|
10
|
+
require 'simple_xlsx_reader/loader/workbook_parser'
|
11
|
+
require 'simple_xlsx_reader/loader/shared_strings_parser'
|
12
|
+
require 'simple_xlsx_reader/loader/sheet_parser'
|
13
|
+
require 'simple_xlsx_reader/loader/style_types_parser'
|
14
|
+
|
15
|
+
|
5
16
|
# Rubyzip 1.0 only has different naming, everything else is the same, so let's
|
6
17
|
# be flexible so we don't force people into a dependency hell w/ other gems.
|
7
18
|
begin
|
@@ -17,529 +28,22 @@ rescue LoadError
|
|
17
28
|
end
|
18
29
|
|
19
30
|
module SimpleXlsxReader
|
20
|
-
|
21
|
-
|
22
|
-
# We support hyperlinks as a "type" even though they're technically
|
23
|
-
# represented either as a function or an external reference in the xlsx spec.
|
24
|
-
#
|
25
|
-
# Since having hyperlink data in our sheet usually means we might want to do
|
26
|
-
# something primarily with the URL (store it in the database, download it, etc),
|
27
|
-
# we go through extra effort to parse the function or follow the reference
|
28
|
-
# to represent the hyperlink primarily as a URL. However, maybe we do want
|
29
|
-
# the hyperlink "friendly name" part (as MS calls it), so here we've subclassed
|
30
|
-
# string to tack on the friendly name. This means 80% of us that just want
|
31
|
-
# the URL value will have to do nothing extra, but the 20% that might want the
|
32
|
-
# friendly name can access it.
|
33
|
-
#
|
34
|
-
# Note, by default, the value we would get by just asking the cell would
|
35
|
-
# be the "friendly name" and *not* the URL, which is tucked away in the
|
36
|
-
# function definition or a separate "relationships" meta-document.
|
37
|
-
#
|
38
|
-
# See MS documentation on the HYPERLINK function for some background:
|
39
|
-
# https://support.office.com/en-us/article/HYPERLINK-function-333c7ce6-c5ae-4164-9c47-7de9b76f577f
|
40
|
-
class Hyperlink < String
|
41
|
-
attr_reader :friendly_name
|
42
|
-
|
43
|
-
def initialize(url, friendly_name = nil)
|
44
|
-
@friendly_name = friendly_name
|
45
|
-
super(url)
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
def self.configuration
|
50
|
-
@configuration ||= Struct.new(:catch_cell_load_errors).new.tap do |c|
|
51
|
-
c.catch_cell_load_errors = false
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
|
-
def self.open(file_path)
|
56
|
-
Document.new(file_path: file_path).tap(&:sheets)
|
57
|
-
end
|
58
|
-
|
59
|
-
def self.parse(string_or_io)
|
60
|
-
Document.new(string_or_io: string_or_io).tap(&:sheets)
|
61
|
-
end
|
62
|
-
|
63
|
-
class Document
|
64
|
-
attr_reader :string_or_io
|
65
|
-
|
66
|
-
def initialize(legacy_file_path = nil, file_path: nil, string_or_io: nil)
|
67
|
-
((file_path || legacy_file_path).nil? ^ string_or_io.nil?) ||
|
68
|
-
fail(ArgumentError, 'either file_path or string_or_io must be provided')
|
69
|
-
|
70
|
-
@string_or_io = string_or_io || File.new(file_path || legacy_file_path)
|
71
|
-
end
|
72
|
-
|
73
|
-
def sheets
|
74
|
-
@sheets ||= Mapper.new(xml).load_sheets
|
75
|
-
end
|
31
|
+
DATE_SYSTEM_1900 = Date.new(1899, 12, 30)
|
32
|
+
DATE_SYSTEM_1904 = Date.new(1904, 1, 1)
|
76
33
|
|
77
|
-
|
78
|
-
sheets.inject({}) {|acc, sheet| acc[sheet.name] = sheet.rows; acc}
|
79
|
-
end
|
80
|
-
|
81
|
-
def xml
|
82
|
-
Xml.load(string_or_io)
|
83
|
-
end
|
84
|
-
|
85
|
-
class Sheet < Struct.new(:name, :rows)
|
86
|
-
def headers
|
87
|
-
rows[0]
|
88
|
-
end
|
89
|
-
|
90
|
-
def data
|
91
|
-
rows[1..-1]
|
92
|
-
end
|
93
|
-
|
94
|
-
# Load errors will be a hash of the form:
|
95
|
-
# {
|
96
|
-
# [rownum, colnum] => '[error]'
|
97
|
-
# }
|
98
|
-
def load_errors
|
99
|
-
@load_errors ||= {}
|
100
|
-
end
|
101
|
-
end
|
102
|
-
|
103
|
-
##
|
104
|
-
# For internal use; stores source xml in nokogiri documents
|
105
|
-
class Xml
|
106
|
-
attr_accessor :workbook, :shared_strings, :sheets, :sheet_rels, :styles
|
107
|
-
|
108
|
-
def self.load(string_or_io)
|
109
|
-
self.new.tap do |xml|
|
110
|
-
SimpleXlsxReader::Zip.open_buffer(string_or_io) do |zip|
|
111
|
-
xml.sheets = []
|
112
|
-
xml.sheet_rels = []
|
113
|
-
|
114
|
-
# This weird style of enumerating over the entries lets us
|
115
|
-
# concisely assign entries in a case insensitive and
|
116
|
-
# slash insensitive ('/' vs '\') manner.
|
117
|
-
#
|
118
|
-
# RubyZip used to normalize the slashes, but doesn't now:
|
119
|
-
# https://github.com/rubyzip/rubyzip/issues/324
|
120
|
-
zip.entries.each do |entry|
|
121
|
-
if entry.name.match(/^xl.workbook\.xml$/) # xl/workbook.xml
|
122
|
-
xml.workbook = Nokogiri::XML(zip.read(entry)).remove_namespaces!
|
123
|
-
elsif entry.name.match(/^xl.styles\.xml$/) # xl/styles.xml
|
124
|
-
xml.styles = Nokogiri::XML(zip.read(entry)).remove_namespaces!
|
125
|
-
elsif entry.name.match(/^xl.sharedStrings\.xml$/i) # xl/sharedStrings.xml
|
126
|
-
# optional feature used by excel, but not often used by xlsx
|
127
|
-
# generation libraries. Path name is sometimes lowercase, too.
|
128
|
-
xml.shared_strings = Nokogiri::XML(zip.read(entry)).remove_namespaces!
|
129
|
-
elsif match = entry.name.match(/^xl.worksheets.sheet([0-9]*)\.xml$/)
|
130
|
-
sheet_number = match.captures.first.to_i
|
131
|
-
xml.sheets[sheet_number] =
|
132
|
-
Nokogiri::XML(zip.read(entry)).remove_namespaces!
|
133
|
-
elsif match = entry.name.match(/^xl.worksheets._rels.sheet([0-9]*)\.xml\.rels$/)
|
134
|
-
sheet_number = match.captures.first.to_i
|
135
|
-
xml.sheet_rels[sheet_number] =
|
136
|
-
Nokogiri::XML(zip.read(entry)).remove_namespaces!
|
137
|
-
end
|
138
|
-
end
|
34
|
+
class CellLoadError < StandardError; end
|
139
35
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
# xl/worksheets/sheet2.xml
|
146
|
-
# While Excel creates:
|
147
|
-
# xl/worksheets/sheet1.xml
|
148
|
-
# xl/worksheets/sheet2.xml
|
149
|
-
#
|
150
|
-
# So, for the latter case, let's shift [null, <Sheet 1>, <Sheet 2>]
|
151
|
-
if !xml.sheets[0]
|
152
|
-
xml.sheets.shift
|
153
|
-
xml.sheet_rels.shift
|
154
|
-
end
|
155
|
-
end
|
156
|
-
end
|
36
|
+
class << self
|
37
|
+
def configuration
|
38
|
+
@configuration ||= Struct.new(:catch_cell_load_errors, :auto_slurp).new.tap do |c|
|
39
|
+
c.catch_cell_load_errors = false
|
40
|
+
c.auto_slurp = false
|
157
41
|
end
|
158
42
|
end
|
159
43
|
|
160
|
-
|
161
|
-
|
162
|
-
class Mapper < Struct.new(:xml)
|
163
|
-
DATE_SYSTEM_1900 = Date.new(1899, 12, 30)
|
164
|
-
DATE_SYSTEM_1904 = Date.new(1904, 1, 1)
|
165
|
-
|
166
|
-
def load_sheets
|
167
|
-
sheet_toc.each_with_index.map do |(sheet_name, _sheet_number), i|
|
168
|
-
parse_sheet(sheet_name, xml.sheets[i], xml.sheet_rels[i]) # sheet_number is *not* the index into xml.sheets
|
169
|
-
end
|
170
|
-
end
|
171
|
-
|
172
|
-
# Table of contents for the sheets, ex. {'Authors' => 0, ...}
|
173
|
-
def sheet_toc
|
174
|
-
xml.workbook.xpath('/workbook/sheets/sheet').
|
175
|
-
inject({}) do |acc, sheet|
|
176
|
-
|
177
|
-
acc[sheet.attributes['name'].value] =
|
178
|
-
sheet.attributes['sheetId'].value.to_i - 1 # keep things 0-indexed
|
179
|
-
|
180
|
-
acc
|
181
|
-
end
|
182
|
-
end
|
183
|
-
|
184
|
-
def parse_sheet(sheet_name, xsheet, xrels)
|
185
|
-
sheet = Sheet.new(sheet_name)
|
186
|
-
sheet_width, sheet_height = *sheet_dimensions(xsheet)
|
187
|
-
cells_w_links = xsheet.xpath('//hyperlinks/hyperlink').inject({}) {|acc, e| acc[e.attr(:ref)] = e.attr(:id); acc}
|
188
|
-
|
189
|
-
sheet.rows = Array.new(sheet_height) { Array.new(sheet_width) }
|
190
|
-
xsheet.xpath("/worksheet/sheetData/row/c").each do |xcell|
|
191
|
-
column, row = *xcell.attr('r').match(/([A-Z]+)([0-9]+)/).captures
|
192
|
-
col_idx = column_letter_to_number(column) - 1
|
193
|
-
row_idx = row.to_i - 1
|
194
|
-
|
195
|
-
type = xcell.attributes['t'] &&
|
196
|
-
xcell.attributes['t'].value
|
197
|
-
style = xcell.attributes['s'] &&
|
198
|
-
style_types[xcell.attributes['s'].value.to_i]
|
199
|
-
|
200
|
-
# This is the main performance bottleneck. Using just 'xcell.text'
|
201
|
-
# would be ideal, and makes parsing super-fast. However, there's
|
202
|
-
# other junk in the cell, formula references in particular,
|
203
|
-
# so we really do have to look for specific value nodes.
|
204
|
-
# Maybe there is a really clever way to use xcell.text and parse out
|
205
|
-
# the correct value, but I can't think of one, or an alternative
|
206
|
-
# strategy.
|
207
|
-
#
|
208
|
-
# And yes, this really is faster than using xcell.at_xpath(...),
|
209
|
-
# by about 60%. Odd.
|
210
|
-
xvalue = type == 'inlineStr' ?
|
211
|
-
(xis = xcell.children.find {|c| c.name == 'is'}) && xis.children.find {|c| c.name == 't'} :
|
212
|
-
xcell.children.find {|c| c.name == 'f' && c.text.start_with?('HYPERLINK(') || c.name == 'v'}
|
213
|
-
|
214
|
-
if xvalue
|
215
|
-
value = xvalue.text.strip
|
216
|
-
|
217
|
-
if rel_id = cells_w_links[xcell.attr('r')] # a hyperlink made via GUI
|
218
|
-
url = xrels.at_xpath(%(//*[@Id="#{rel_id}"])).attr('Target')
|
219
|
-
elsif xvalue.name == 'f' # only time we have a function is if it's a hyperlink
|
220
|
-
url = value.slice(/HYPERLINK\("(.*?)"/, 1)
|
221
|
-
end
|
222
|
-
end
|
223
|
-
|
224
|
-
cell = begin
|
225
|
-
self.class.cast(value, type, style,
|
226
|
-
:url => url,
|
227
|
-
:shared_strings => shared_strings,
|
228
|
-
:base_date => base_date)
|
229
|
-
rescue => e
|
230
|
-
if !SimpleXlsxReader.configuration.catch_cell_load_errors
|
231
|
-
error = CellLoadError.new(
|
232
|
-
"Row #{row_idx}, Col #{col_idx}: #{e.message}")
|
233
|
-
error.set_backtrace(e.backtrace)
|
234
|
-
raise error
|
235
|
-
else
|
236
|
-
sheet.load_errors[[row_idx, col_idx]] = e.message
|
237
|
-
|
238
|
-
xcell.text.strip
|
239
|
-
end
|
240
|
-
end
|
241
|
-
|
242
|
-
# This shouldn't be necessary, but just in case, we'll create
|
243
|
-
# the row so we don't blow up. This means any null rows in between
|
244
|
-
# will be null instead of [null, null, ...]
|
245
|
-
sheet.rows[row_idx] ||= Array.new(sheet_width)
|
246
|
-
|
247
|
-
sheet.rows[row_idx][col_idx] = cell
|
248
|
-
end
|
249
|
-
|
250
|
-
sheet
|
251
|
-
end
|
252
|
-
|
253
|
-
##
|
254
|
-
# Returns the last column name, ex. 'E'
|
255
|
-
#
|
256
|
-
# Note that excel writes a '/worksheet/dimension' node we can get the
|
257
|
-
# last cell from, but some libs (ex. simple_xlsx_writer) don't record
|
258
|
-
# this. In that case, we assume the data is of uniform column length
|
259
|
-
# and check the column name of the last header row. Obviously this isn't
|
260
|
-
# the most robust strategy, but it likely fits 99% of use cases
|
261
|
-
# considering it's not a problem with actual excel docs.
|
262
|
-
def last_cell_label(xsheet)
|
263
|
-
dimension = xsheet.at_xpath('/worksheet/dimension')
|
264
|
-
if dimension
|
265
|
-
col = dimension.attributes['ref'].value.match(/:([A-Z]+[0-9]+)/)
|
266
|
-
col ? col.captures.first : 'A1'
|
267
|
-
else
|
268
|
-
last = xsheet.at_xpath("/worksheet/sheetData/row[last()]/c[last()]")
|
269
|
-
last ? last.attributes['r'].value.match(/([A-Z]+[0-9]+)/).captures.first : 'A1'
|
270
|
-
end
|
271
|
-
end
|
272
|
-
|
273
|
-
# Returns dimensions (1-indexed)
|
274
|
-
def sheet_dimensions(xsheet)
|
275
|
-
column, row = *last_cell_label(xsheet).match(/([A-Z]+)([0-9]+)/).captures
|
276
|
-
[column_letter_to_number(column), row.to_i]
|
277
|
-
end
|
278
|
-
|
279
|
-
# formula fits an exponential factorial function of the form:
|
280
|
-
# 'A' = 1
|
281
|
-
# 'B' = 2
|
282
|
-
# 'Z' = 26
|
283
|
-
# 'AA' = 26 * 1 + 1
|
284
|
-
# 'AZ' = 26 * 1 + 26
|
285
|
-
# 'BA' = 26 * 2 + 1
|
286
|
-
# 'ZA' = 26 * 26 + 1
|
287
|
-
# 'ZZ' = 26 * 26 + 26
|
288
|
-
# 'AAA' = 26 * 26 * 1 + 26 * 1 + 1
|
289
|
-
# 'AAZ' = 26 * 26 * 1 + 26 * 1 + 26
|
290
|
-
# 'ABA' = 26 * 26 * 1 + 26 * 2 + 1
|
291
|
-
# 'BZA' = 26 * 26 * 2 + 26 * 26 + 1
|
292
|
-
def column_letter_to_number(column_letter)
|
293
|
-
pow = column_letter.length - 1
|
294
|
-
result = 0
|
295
|
-
column_letter.each_byte do |b|
|
296
|
-
result += 26**pow * (b - 64)
|
297
|
-
pow -= 1
|
298
|
-
end
|
299
|
-
result
|
300
|
-
end
|
301
|
-
|
302
|
-
# Excel doesn't record types for some cells, only its display style, so
|
303
|
-
# we have to back out the type from that style.
|
304
|
-
#
|
305
|
-
# Some of these styles can be determined from a known set (see NumFmtMap),
|
306
|
-
# while others are 'custom' and we have to make a best guess.
|
307
|
-
#
|
308
|
-
# This is the array of types corresponding to the styles a spreadsheet
|
309
|
-
# uses, and includes both the known style types and the custom styles.
|
310
|
-
#
|
311
|
-
# Note that the xml sheet cells that use this don't reference the
|
312
|
-
# numFmtId, but instead the array index of a style in the stored list of
|
313
|
-
# only the styles used in the spreadsheet (which can be either known or
|
314
|
-
# custom). Hence this style types array, rather than a map of numFmtId to
|
315
|
-
# type.
|
316
|
-
def style_types
|
317
|
-
@style_types ||=
|
318
|
-
xml.styles.xpath('/styleSheet/cellXfs/xf').map {|xstyle|
|
319
|
-
style_type_by_num_fmt_id(num_fmt_id(xstyle))}
|
320
|
-
end
|
321
|
-
|
322
|
-
#returns the numFmtId value if it's available
|
323
|
-
def num_fmt_id(xstyle)
|
324
|
-
if xstyle.attributes['numFmtId']
|
325
|
-
xstyle.attributes['numFmtId'].value
|
326
|
-
else
|
327
|
-
nil
|
328
|
-
end
|
329
|
-
end
|
330
|
-
|
331
|
-
# Finds the type we think a style is; For example, fmtId 14 is a date
|
332
|
-
# style, so this would return :date.
|
333
|
-
#
|
334
|
-
# Note, custom styles usually (are supposed to?) have a numFmtId >= 164,
|
335
|
-
# but in practice can sometimes be simply out of the usual "Any Language"
|
336
|
-
# id range that goes up to 49. For example, I have seen a numFmtId of
|
337
|
-
# 59 specified as a date. In Thai, 59 is a number format, so this seems
|
338
|
-
# like a bad idea, but we try to be flexible and just go with it.
|
339
|
-
def style_type_by_num_fmt_id(id)
|
340
|
-
return nil if id.nil?
|
341
|
-
|
342
|
-
id = id.to_i
|
343
|
-
NumFmtMap[id] || custom_style_types[id]
|
344
|
-
end
|
345
|
-
|
346
|
-
# Map of (numFmtId >= 164) (custom styles) to our best guess at the type
|
347
|
-
# ex. {164 => :date_time}
|
348
|
-
def custom_style_types
|
349
|
-
@custom_style_types ||=
|
350
|
-
xml.styles.xpath('/styleSheet/numFmts/numFmt').
|
351
|
-
inject({}) do |acc, xstyle|
|
352
|
-
|
353
|
-
acc[xstyle.attributes['numFmtId'].value.to_i] =
|
354
|
-
determine_custom_style_type(xstyle.attributes['formatCode'].value)
|
355
|
-
|
356
|
-
acc
|
357
|
-
end
|
358
|
-
end
|
359
|
-
|
360
|
-
# This is the least deterministic part of reading xlsx files. Due to
|
361
|
-
# custom styles, you can't know for sure when a date is a date other than
|
362
|
-
# looking at its format and gessing. It's not impossible to guess right,
|
363
|
-
# though.
|
364
|
-
#
|
365
|
-
# http://stackoverflow.com/questions/4948998/determining-if-an-xlsx-cell-is-date-formatted-for-excel-2007-spreadsheets
|
366
|
-
def determine_custom_style_type(string)
|
367
|
-
return :float if string[0] == '_'
|
368
|
-
return :float if string[0] == ' 0'
|
369
|
-
|
370
|
-
# Looks for one of ymdhis outside of meta-stuff like [Red]
|
371
|
-
return :date_time if string =~ /(^|\])[^\[]*[ymdhis]/i
|
372
|
-
|
373
|
-
return :unsupported
|
374
|
-
end
|
375
|
-
|
376
|
-
##
|
377
|
-
# The heart of typecasting. The ruby type is determined either explicitly
|
378
|
-
# from the cell xml or implicitly from the cell style, and this
|
379
|
-
# method expects that work to have been done already. This, then,
|
380
|
-
# takes the type we determined it to be and casts the cell value
|
381
|
-
# to that type.
|
382
|
-
#
|
383
|
-
# types:
|
384
|
-
# - s: shared string (see #shared_string)
|
385
|
-
# - n: number (cast to a float)
|
386
|
-
# - b: boolean
|
387
|
-
# - str: string
|
388
|
-
# - inlineStr: string
|
389
|
-
# - ruby symbol: for when type has been determined by style
|
390
|
-
#
|
391
|
-
# options:
|
392
|
-
# - shared_strings: needed for 's' (shared string) type
|
393
|
-
def self.cast(value, type, style, options = {})
|
394
|
-
return nil if value.nil? || value.empty?
|
395
|
-
|
396
|
-
# Sometimes the type is dictated by the style alone
|
397
|
-
if type.nil? ||
|
398
|
-
(type == 'n' && [:date, :time, :date_time].include?(style))
|
399
|
-
type = style
|
400
|
-
end
|
401
|
-
|
402
|
-
casted = case type
|
403
|
-
|
404
|
-
##
|
405
|
-
# There are few built-in types
|
406
|
-
##
|
407
|
-
|
408
|
-
when 's' # shared string
|
409
|
-
options[:shared_strings][value.to_i]
|
410
|
-
when 'n' # number
|
411
|
-
value.to_f
|
412
|
-
when 'b'
|
413
|
-
value.to_i == 1
|
414
|
-
when 'str'
|
415
|
-
value
|
416
|
-
when 'inlineStr'
|
417
|
-
value
|
418
|
-
|
419
|
-
##
|
420
|
-
# Type can also be determined by a style,
|
421
|
-
# detected earlier and cast here by its standardized symbol
|
422
|
-
##
|
423
|
-
|
424
|
-
when :string, :unsupported
|
425
|
-
value
|
426
|
-
when :fixnum
|
427
|
-
value.to_i
|
428
|
-
when :float
|
429
|
-
value.to_f
|
430
|
-
when :percentage
|
431
|
-
value.to_f / 100
|
432
|
-
# the trickiest. note that all these formats can vary on
|
433
|
-
# whether they actually contain a date, time, or datetime.
|
434
|
-
when :date, :time, :date_time
|
435
|
-
value = Float(value)
|
436
|
-
days_since_date_system_start = value.to_i
|
437
|
-
fraction_of_24 = value - days_since_date_system_start
|
438
|
-
|
439
|
-
# http://stackoverflow.com/questions/10559767/how-to-convert-ms-excel-date-from-float-to-date-format-in-ruby
|
440
|
-
date = options.fetch(:base_date, DATE_SYSTEM_1900) + days_since_date_system_start
|
441
|
-
|
442
|
-
if fraction_of_24 > 0 # there is a time associated
|
443
|
-
seconds = (fraction_of_24 * 86400).round
|
444
|
-
return Time.utc(date.year, date.month, date.day) + seconds
|
445
|
-
else
|
446
|
-
return date
|
447
|
-
end
|
448
|
-
when :bignum
|
449
|
-
if defined?(BigDecimal)
|
450
|
-
BigDecimal.new(value)
|
451
|
-
else
|
452
|
-
value.to_f
|
453
|
-
end
|
454
|
-
|
455
|
-
##
|
456
|
-
# Beats me
|
457
|
-
##
|
458
|
-
|
459
|
-
else
|
460
|
-
value
|
461
|
-
end
|
462
|
-
|
463
|
-
if options[:url]
|
464
|
-
Hyperlink.new(options[:url], casted)
|
465
|
-
else
|
466
|
-
casted
|
467
|
-
end
|
468
|
-
end
|
469
|
-
|
470
|
-
## Returns the base_date from which to calculate dates.
|
471
|
-
# Defaults to 1900 (minus two days due to excel quirk), but use 1904 if
|
472
|
-
# it's set in the Workbook's workbookPr.
|
473
|
-
# http://msdn.microsoft.com/en-us/library/ff530155(v=office.12).aspx
|
474
|
-
def base_date
|
475
|
-
@base_date ||=
|
476
|
-
begin
|
477
|
-
return DATE_SYSTEM_1900 if xml.workbook == nil
|
478
|
-
xml.workbook.xpath("//workbook/workbookPr[@date1904]").each do |workbookPr|
|
479
|
-
return DATE_SYSTEM_1904 if workbookPr["date1904"] =~ /true|1/i
|
480
|
-
end
|
481
|
-
DATE_SYSTEM_1900
|
482
|
-
end
|
483
|
-
end
|
484
|
-
|
485
|
-
# Map of non-custom numFmtId to casting symbol
|
486
|
-
NumFmtMap = {
|
487
|
-
0 => :string, # General
|
488
|
-
1 => :fixnum, # 0
|
489
|
-
2 => :float, # 0.00
|
490
|
-
3 => :fixnum, # #,##0
|
491
|
-
4 => :float, # #,##0.00
|
492
|
-
5 => :unsupported, # $#,##0_);($#,##0)
|
493
|
-
6 => :unsupported, # $#,##0_);[Red]($#,##0)
|
494
|
-
7 => :unsupported, # $#,##0.00_);($#,##0.00)
|
495
|
-
8 => :unsupported, # $#,##0.00_);[Red]($#,##0.00)
|
496
|
-
9 => :percentage, # 0%
|
497
|
-
10 => :percentage, # 0.00%
|
498
|
-
11 => :bignum, # 0.00E+00
|
499
|
-
12 => :unsupported, # # ?/?
|
500
|
-
13 => :unsupported, # # ??/??
|
501
|
-
14 => :date, # mm-dd-yy
|
502
|
-
15 => :date, # d-mmm-yy
|
503
|
-
16 => :date, # d-mmm
|
504
|
-
17 => :date, # mmm-yy
|
505
|
-
18 => :time, # h:mm AM/PM
|
506
|
-
19 => :time, # h:mm:ss AM/PM
|
507
|
-
20 => :time, # h:mm
|
508
|
-
21 => :time, # h:mm:ss
|
509
|
-
22 => :date_time, # m/d/yy h:mm
|
510
|
-
37 => :unsupported, # #,##0 ;(#,##0)
|
511
|
-
38 => :unsupported, # #,##0 ;[Red](#,##0)
|
512
|
-
39 => :unsupported, # #,##0.00;(#,##0.00)
|
513
|
-
40 => :unsupported, # #,##0.00;[Red](#,##0.00)
|
514
|
-
45 => :time, # mm:ss
|
515
|
-
46 => :time, # [h]:mm:ss
|
516
|
-
47 => :time, # mmss.0
|
517
|
-
48 => :bignum, # ##0.0E+0
|
518
|
-
49 => :unsupported # @
|
519
|
-
}
|
520
|
-
|
521
|
-
# For performance reasons, excel uses an optional SpreadsheetML feature
|
522
|
-
# that puts all strings in a separate xml file, and then references
|
523
|
-
# them by their index in that file.
|
524
|
-
#
|
525
|
-
# http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
|
526
|
-
def shared_strings
|
527
|
-
@shared_strings ||= begin
|
528
|
-
if xml.shared_strings
|
529
|
-
xml.shared_strings.xpath('/sst/si').map do |xsst|
|
530
|
-
# a shared string can be a single value...
|
531
|
-
sst = xsst.at_xpath('t/text()')
|
532
|
-
sst = sst.text if sst
|
533
|
-
# ... or a composite of seperately styled words/characters
|
534
|
-
sst ||= xsst.xpath('r/t/text()').map(&:text).join
|
535
|
-
end
|
536
|
-
else
|
537
|
-
[]
|
538
|
-
end
|
539
|
-
end
|
540
|
-
end
|
541
|
-
|
44
|
+
def open(file_path)
|
45
|
+
Document.new(file_path).tap(&:sheets)
|
542
46
|
end
|
543
|
-
|
47
|
+
alias parse open
|
544
48
|
end
|
545
49
|
end
|
data/test/date1904_test.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require_relative 'test_helper'
|
2
4
|
|
3
5
|
describe SimpleXlsxReader do
|
@@ -5,9 +7,8 @@ describe SimpleXlsxReader do
|
|
5
7
|
let(:subject) { SimpleXlsxReader::Document.new(date1904_file) }
|
6
8
|
|
7
9
|
it 'supports converting dates with the 1904 date system' do
|
8
|
-
subject.to_hash.must_equal(
|
9
|
-
|
10
|
-
|
10
|
+
_(subject.to_hash).must_equal(
|
11
|
+
'date1904' => [[Date.parse('2014-05-01')]]
|
12
|
+
)
|
11
13
|
end
|
12
|
-
|
13
14
|
end
|
data/test/datetime_test.rb
CHANGED
@@ -1,19 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require_relative 'test_helper'
|
2
4
|
|
3
5
|
describe SimpleXlsxReader do
|
4
|
-
let(:datetimes_file)
|
5
|
-
|
6
|
+
let(:datetimes_file) do
|
7
|
+
File.join(
|
8
|
+
File.dirname(__FILE__),
|
9
|
+
'datetimes.xlsx'
|
10
|
+
)
|
11
|
+
end
|
6
12
|
|
7
13
|
let(:subject) { SimpleXlsxReader::Document.new(datetimes_file) }
|
8
14
|
|
9
15
|
it 'converts date_times with the correct precision' do
|
10
|
-
subject.to_hash.must_equal(
|
11
|
-
|
12
|
-
[
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
16
|
+
_(subject.to_hash).must_equal(
|
17
|
+
'Datetimes' =>
|
18
|
+
[
|
19
|
+
[Time.parse('2013-08-19 18:29:59 UTC')],
|
20
|
+
[Time.parse('2013-08-19 18:30:00 UTC')],
|
21
|
+
[Time.parse('2013-08-19 18:30:01 UTC')],
|
22
|
+
[Time.parse('1899-12-30 00:30:00 UTC')]
|
23
|
+
]
|
24
|
+
)
|
17
25
|
end
|
18
|
-
|
19
26
|
end
|
data/test/gdocs_sheet_test.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require_relative 'test_helper'
|
2
4
|
require 'time'
|
3
5
|
|
@@ -6,10 +8,9 @@ describe SimpleXlsxReader do
|
|
6
8
|
let(:subject) { SimpleXlsxReader::Document.new(one_sheet_file) }
|
7
9
|
|
8
10
|
it 'able to load file from google docs' do
|
9
|
-
subject.to_hash.must_equal(
|
10
|
-
|
11
|
-
|
12
|
-
|
11
|
+
_(subject.to_hash).must_equal(
|
12
|
+
'List 1' => [['Empty gdocs list 1']],
|
13
|
+
'List 2' => [['Empty gdocs list 2']]
|
14
|
+
)
|
13
15
|
end
|
14
|
-
|
15
16
|
end
|
@@ -1,15 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require_relative 'test_helper'
|
2
4
|
|
3
5
|
describe SimpleXlsxReader do
|
4
|
-
let(:lower_case_shared_strings)
|
5
|
-
|
6
|
+
let(:lower_case_shared_strings) do
|
7
|
+
File.join(
|
8
|
+
File.dirname(__FILE__),
|
9
|
+
'lower_case_sharedstrings.xlsx'
|
10
|
+
)
|
11
|
+
end
|
6
12
|
|
7
13
|
let(:subject) { SimpleXlsxReader::Document.new(lower_case_shared_strings) }
|
8
14
|
|
9
|
-
|
10
15
|
describe '#to_hash' do
|
11
16
|
it 'should have the word Well in the first row' do
|
12
|
-
subject.sheets.first.rows[0].must_include('Well')
|
17
|
+
_(subject.sheets.first.rows.to_a[0]).must_include('Well')
|
13
18
|
end
|
14
19
|
end
|
15
20
|
end
|