simple_xlsx_reader 1.0.5 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +38 -0
- data/CHANGELOG.md +7 -0
- data/README.md +190 -64
- data/Rakefile +3 -1
- data/lib/simple_xlsx_reader/document.rb +147 -0
- data/lib/simple_xlsx_reader/hyperlink.rb +30 -0
- data/lib/simple_xlsx_reader/loader/shared_strings_parser.rb +46 -0
- data/lib/simple_xlsx_reader/loader/sheet_parser.rb +256 -0
- data/lib/simple_xlsx_reader/loader/style_types_parser.rb +115 -0
- data/lib/simple_xlsx_reader/loader/workbook_parser.rb +39 -0
- data/lib/simple_xlsx_reader/loader.rb +199 -0
- data/lib/simple_xlsx_reader/version.rb +3 -1
- data/lib/simple_xlsx_reader.rb +23 -519
- data/test/date1904_test.rb +5 -4
- data/test/datetime_test.rb +17 -10
- data/test/gdocs_sheet_test.rb +6 -5
- data/test/lower_case_sharedstrings_test.rb +9 -4
- data/test/performance_test.rb +85 -88
- data/test/shared_strings.xml +4 -0
- data/test/simple_xlsx_reader_test.rb +785 -375
- data/test/test_helper.rb +4 -1
- data/test/test_xlsx_builder.rb +104 -0
- metadata +16 -6
data/lib/simple_xlsx_reader.rb
CHANGED
@@ -1,7 +1,18 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
require 'nokogiri'
|
3
4
|
require 'date'
|
4
5
|
|
6
|
+
require 'simple_xlsx_reader/version'
|
7
|
+
require 'simple_xlsx_reader/hyperlink'
|
8
|
+
require 'simple_xlsx_reader/document'
|
9
|
+
require 'simple_xlsx_reader/loader'
|
10
|
+
require 'simple_xlsx_reader/loader/workbook_parser'
|
11
|
+
require 'simple_xlsx_reader/loader/shared_strings_parser'
|
12
|
+
require 'simple_xlsx_reader/loader/sheet_parser'
|
13
|
+
require 'simple_xlsx_reader/loader/style_types_parser'
|
14
|
+
|
15
|
+
|
5
16
|
# Rubyzip 1.0 only has different naming, everything else is the same, so let's
|
6
17
|
# be flexible so we don't force people into a dependency hell w/ other gems.
|
7
18
|
begin
|
@@ -17,529 +28,22 @@ rescue LoadError
|
|
17
28
|
end
|
18
29
|
|
19
30
|
module SimpleXlsxReader
|
20
|
-
|
21
|
-
|
22
|
-
# We support hyperlinks as a "type" even though they're technically
|
23
|
-
# represented either as a function or an external reference in the xlsx spec.
|
24
|
-
#
|
25
|
-
# Since having hyperlink data in our sheet usually means we might want to do
|
26
|
-
# something primarily with the URL (store it in the database, download it, etc),
|
27
|
-
# we go through extra effort to parse the function or follow the reference
|
28
|
-
# to represent the hyperlink primarily as a URL. However, maybe we do want
|
29
|
-
# the hyperlink "friendly name" part (as MS calls it), so here we've subclassed
|
30
|
-
# string to tack on the friendly name. This means 80% of us that just want
|
31
|
-
# the URL value will have to do nothing extra, but the 20% that might want the
|
32
|
-
# friendly name can access it.
|
33
|
-
#
|
34
|
-
# Note, by default, the value we would get by just asking the cell would
|
35
|
-
# be the "friendly name" and *not* the URL, which is tucked away in the
|
36
|
-
# function definition or a separate "relationships" meta-document.
|
37
|
-
#
|
38
|
-
# See MS documentation on the HYPERLINK function for some background:
|
39
|
-
# https://support.office.com/en-us/article/HYPERLINK-function-333c7ce6-c5ae-4164-9c47-7de9b76f577f
|
40
|
-
class Hyperlink < String
|
41
|
-
attr_reader :friendly_name
|
42
|
-
|
43
|
-
def initialize(url, friendly_name = nil)
|
44
|
-
@friendly_name = friendly_name
|
45
|
-
super(url)
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
def self.configuration
|
50
|
-
@configuration ||= Struct.new(:catch_cell_load_errors).new.tap do |c|
|
51
|
-
c.catch_cell_load_errors = false
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
|
-
def self.open(file_path)
|
56
|
-
Document.new(file_path: file_path).tap(&:sheets)
|
57
|
-
end
|
58
|
-
|
59
|
-
def self.parse(string_or_io)
|
60
|
-
Document.new(string_or_io: string_or_io).tap(&:sheets)
|
61
|
-
end
|
62
|
-
|
63
|
-
class Document
|
64
|
-
attr_reader :string_or_io
|
65
|
-
|
66
|
-
def initialize(legacy_file_path = nil, file_path: nil, string_or_io: nil)
|
67
|
-
((file_path || legacy_file_path).nil? ^ string_or_io.nil?) ||
|
68
|
-
fail(ArgumentError, 'either file_path or string_or_io must be provided')
|
69
|
-
|
70
|
-
@string_or_io = string_or_io || File.new(file_path || legacy_file_path)
|
71
|
-
end
|
72
|
-
|
73
|
-
def sheets
|
74
|
-
@sheets ||= Mapper.new(xml).load_sheets
|
75
|
-
end
|
31
|
+
DATE_SYSTEM_1900 = Date.new(1899, 12, 30)
|
32
|
+
DATE_SYSTEM_1904 = Date.new(1904, 1, 1)
|
76
33
|
|
77
|
-
|
78
|
-
sheets.inject({}) {|acc, sheet| acc[sheet.name] = sheet.rows; acc}
|
79
|
-
end
|
80
|
-
|
81
|
-
def xml
|
82
|
-
Xml.load(string_or_io)
|
83
|
-
end
|
84
|
-
|
85
|
-
class Sheet < Struct.new(:name, :rows)
|
86
|
-
def headers
|
87
|
-
rows[0]
|
88
|
-
end
|
89
|
-
|
90
|
-
def data
|
91
|
-
rows[1..-1]
|
92
|
-
end
|
93
|
-
|
94
|
-
# Load errors will be a hash of the form:
|
95
|
-
# {
|
96
|
-
# [rownum, colnum] => '[error]'
|
97
|
-
# }
|
98
|
-
def load_errors
|
99
|
-
@load_errors ||= {}
|
100
|
-
end
|
101
|
-
end
|
102
|
-
|
103
|
-
##
|
104
|
-
# For internal use; stores source xml in nokogiri documents
|
105
|
-
class Xml
|
106
|
-
attr_accessor :workbook, :shared_strings, :sheets, :sheet_rels, :styles
|
107
|
-
|
108
|
-
def self.load(string_or_io)
|
109
|
-
self.new.tap do |xml|
|
110
|
-
SimpleXlsxReader::Zip.open_buffer(string_or_io) do |zip|
|
111
|
-
xml.sheets = []
|
112
|
-
xml.sheet_rels = []
|
113
|
-
|
114
|
-
# This weird style of enumerating over the entries lets us
|
115
|
-
# concisely assign entries in a case insensitive and
|
116
|
-
# slash insensitive ('/' vs '\') manner.
|
117
|
-
#
|
118
|
-
# RubyZip used to normalize the slashes, but doesn't now:
|
119
|
-
# https://github.com/rubyzip/rubyzip/issues/324
|
120
|
-
zip.entries.each do |entry|
|
121
|
-
if entry.name.match(/^xl.workbook\.xml$/) # xl/workbook.xml
|
122
|
-
xml.workbook = Nokogiri::XML(zip.read(entry)).remove_namespaces!
|
123
|
-
elsif entry.name.match(/^xl.styles\.xml$/) # xl/styles.xml
|
124
|
-
xml.styles = Nokogiri::XML(zip.read(entry)).remove_namespaces!
|
125
|
-
elsif entry.name.match(/^xl.sharedStrings\.xml$/i) # xl/sharedStrings.xml
|
126
|
-
# optional feature used by excel, but not often used by xlsx
|
127
|
-
# generation libraries. Path name is sometimes lowercase, too.
|
128
|
-
xml.shared_strings = Nokogiri::XML(zip.read(entry)).remove_namespaces!
|
129
|
-
elsif match = entry.name.match(/^xl.worksheets.sheet([0-9]*)\.xml$/)
|
130
|
-
sheet_number = match.captures.first.to_i
|
131
|
-
xml.sheets[sheet_number] =
|
132
|
-
Nokogiri::XML(zip.read(entry)).remove_namespaces!
|
133
|
-
elsif match = entry.name.match(/^xl.worksheets._rels.sheet([0-9]*)\.xml\.rels$/)
|
134
|
-
sheet_number = match.captures.first.to_i
|
135
|
-
xml.sheet_rels[sheet_number] =
|
136
|
-
Nokogiri::XML(zip.read(entry)).remove_namespaces!
|
137
|
-
end
|
138
|
-
end
|
34
|
+
class CellLoadError < StandardError; end
|
139
35
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
# xl/worksheets/sheet2.xml
|
146
|
-
# While Excel creates:
|
147
|
-
# xl/worksheets/sheet1.xml
|
148
|
-
# xl/worksheets/sheet2.xml
|
149
|
-
#
|
150
|
-
# So, for the latter case, let's shift [null, <Sheet 1>, <Sheet 2>]
|
151
|
-
if !xml.sheets[0]
|
152
|
-
xml.sheets.shift
|
153
|
-
xml.sheet_rels.shift
|
154
|
-
end
|
155
|
-
end
|
156
|
-
end
|
36
|
+
class << self
|
37
|
+
def configuration
|
38
|
+
@configuration ||= Struct.new(:catch_cell_load_errors, :auto_slurp).new.tap do |c|
|
39
|
+
c.catch_cell_load_errors = false
|
40
|
+
c.auto_slurp = false
|
157
41
|
end
|
158
42
|
end
|
159
43
|
|
160
|
-
|
161
|
-
|
162
|
-
class Mapper < Struct.new(:xml)
|
163
|
-
DATE_SYSTEM_1900 = Date.new(1899, 12, 30)
|
164
|
-
DATE_SYSTEM_1904 = Date.new(1904, 1, 1)
|
165
|
-
|
166
|
-
def load_sheets
|
167
|
-
sheet_toc.each_with_index.map do |(sheet_name, _sheet_number), i|
|
168
|
-
parse_sheet(sheet_name, xml.sheets[i], xml.sheet_rels[i]) # sheet_number is *not* the index into xml.sheets
|
169
|
-
end
|
170
|
-
end
|
171
|
-
|
172
|
-
# Table of contents for the sheets, ex. {'Authors' => 0, ...}
|
173
|
-
def sheet_toc
|
174
|
-
xml.workbook.xpath('/workbook/sheets/sheet').
|
175
|
-
inject({}) do |acc, sheet|
|
176
|
-
|
177
|
-
acc[sheet.attributes['name'].value] =
|
178
|
-
sheet.attributes['sheetId'].value.to_i - 1 # keep things 0-indexed
|
179
|
-
|
180
|
-
acc
|
181
|
-
end
|
182
|
-
end
|
183
|
-
|
184
|
-
def parse_sheet(sheet_name, xsheet, xrels)
|
185
|
-
sheet = Sheet.new(sheet_name)
|
186
|
-
sheet_width, sheet_height = *sheet_dimensions(xsheet)
|
187
|
-
cells_w_links = xsheet.xpath('//hyperlinks/hyperlink').inject({}) {|acc, e| acc[e.attr(:ref)] = e.attr(:id); acc}
|
188
|
-
|
189
|
-
sheet.rows = Array.new(sheet_height) { Array.new(sheet_width) }
|
190
|
-
xsheet.xpath("/worksheet/sheetData/row/c").each do |xcell|
|
191
|
-
column, row = *xcell.attr('r').match(/([A-Z]+)([0-9]+)/).captures
|
192
|
-
col_idx = column_letter_to_number(column) - 1
|
193
|
-
row_idx = row.to_i - 1
|
194
|
-
|
195
|
-
type = xcell.attributes['t'] &&
|
196
|
-
xcell.attributes['t'].value
|
197
|
-
style = xcell.attributes['s'] &&
|
198
|
-
style_types[xcell.attributes['s'].value.to_i]
|
199
|
-
|
200
|
-
# This is the main performance bottleneck. Using just 'xcell.text'
|
201
|
-
# would be ideal, and makes parsing super-fast. However, there's
|
202
|
-
# other junk in the cell, formula references in particular,
|
203
|
-
# so we really do have to look for specific value nodes.
|
204
|
-
# Maybe there is a really clever way to use xcell.text and parse out
|
205
|
-
# the correct value, but I can't think of one, or an alternative
|
206
|
-
# strategy.
|
207
|
-
#
|
208
|
-
# And yes, this really is faster than using xcell.at_xpath(...),
|
209
|
-
# by about 60%. Odd.
|
210
|
-
xvalue = type == 'inlineStr' ?
|
211
|
-
(xis = xcell.children.find {|c| c.name == 'is'}) && xis.children.find {|c| c.name == 't'} :
|
212
|
-
xcell.children.find {|c| c.name == 'f' && c.text.start_with?('HYPERLINK(') || c.name == 'v'}
|
213
|
-
|
214
|
-
if xvalue
|
215
|
-
value = xvalue.text.strip
|
216
|
-
|
217
|
-
if rel_id = cells_w_links[xcell.attr('r')] # a hyperlink made via GUI
|
218
|
-
url = xrels.at_xpath(%(//*[@Id="#{rel_id}"])).attr('Target')
|
219
|
-
elsif xvalue.name == 'f' # only time we have a function is if it's a hyperlink
|
220
|
-
url = value.slice(/HYPERLINK\("(.*?)"/, 1)
|
221
|
-
end
|
222
|
-
end
|
223
|
-
|
224
|
-
cell = begin
|
225
|
-
self.class.cast(value, type, style,
|
226
|
-
:url => url,
|
227
|
-
:shared_strings => shared_strings,
|
228
|
-
:base_date => base_date)
|
229
|
-
rescue => e
|
230
|
-
if !SimpleXlsxReader.configuration.catch_cell_load_errors
|
231
|
-
error = CellLoadError.new(
|
232
|
-
"Row #{row_idx}, Col #{col_idx}: #{e.message}")
|
233
|
-
error.set_backtrace(e.backtrace)
|
234
|
-
raise error
|
235
|
-
else
|
236
|
-
sheet.load_errors[[row_idx, col_idx]] = e.message
|
237
|
-
|
238
|
-
xcell.text.strip
|
239
|
-
end
|
240
|
-
end
|
241
|
-
|
242
|
-
# This shouldn't be necessary, but just in case, we'll create
|
243
|
-
# the row so we don't blow up. This means any null rows in between
|
244
|
-
# will be null instead of [null, null, ...]
|
245
|
-
sheet.rows[row_idx] ||= Array.new(sheet_width)
|
246
|
-
|
247
|
-
sheet.rows[row_idx][col_idx] = cell
|
248
|
-
end
|
249
|
-
|
250
|
-
sheet
|
251
|
-
end
|
252
|
-
|
253
|
-
##
|
254
|
-
# Returns the last column name, ex. 'E'
|
255
|
-
#
|
256
|
-
# Note that excel writes a '/worksheet/dimension' node we can get the
|
257
|
-
# last cell from, but some libs (ex. simple_xlsx_writer) don't record
|
258
|
-
# this. In that case, we assume the data is of uniform column length
|
259
|
-
# and check the column name of the last header row. Obviously this isn't
|
260
|
-
# the most robust strategy, but it likely fits 99% of use cases
|
261
|
-
# considering it's not a problem with actual excel docs.
|
262
|
-
def last_cell_label(xsheet)
|
263
|
-
dimension = xsheet.at_xpath('/worksheet/dimension')
|
264
|
-
if dimension
|
265
|
-
col = dimension.attributes['ref'].value.match(/:([A-Z]+[0-9]+)/)
|
266
|
-
col ? col.captures.first : 'A1'
|
267
|
-
else
|
268
|
-
last = xsheet.at_xpath("/worksheet/sheetData/row[last()]/c[last()]")
|
269
|
-
last ? last.attributes['r'].value.match(/([A-Z]+[0-9]+)/).captures.first : 'A1'
|
270
|
-
end
|
271
|
-
end
|
272
|
-
|
273
|
-
# Returns dimensions (1-indexed)
|
274
|
-
def sheet_dimensions(xsheet)
|
275
|
-
column, row = *last_cell_label(xsheet).match(/([A-Z]+)([0-9]+)/).captures
|
276
|
-
[column_letter_to_number(column), row.to_i]
|
277
|
-
end
|
278
|
-
|
279
|
-
# formula is a bijective base-26 conversion (A=1 .. Z=26) of the form:
|
280
|
-
# 'A' = 1
|
281
|
-
# 'B' = 2
|
282
|
-
# 'Z' = 26
|
283
|
-
# 'AA' = 26 * 1 + 1
|
284
|
-
# 'AZ' = 26 * 1 + 26
|
285
|
-
# 'BA' = 26 * 2 + 1
|
286
|
-
# 'ZA' = 26 * 26 + 1
|
287
|
-
# 'ZZ' = 26 * 26 + 26
|
288
|
-
# 'AAA' = 26 * 26 * 1 + 26 * 1 + 1
|
289
|
-
# 'AAZ' = 26 * 26 * 1 + 26 * 1 + 26
|
290
|
-
# 'ABA' = 26 * 26 * 1 + 26 * 2 + 1
|
291
|
-
# 'BZA' = 26 * 26 * 2 + 26 * 26 + 1
|
292
|
-
def column_letter_to_number(column_letter)
|
293
|
-
pow = column_letter.length - 1
|
294
|
-
result = 0
|
295
|
-
column_letter.each_byte do |b|
|
296
|
-
result += 26**pow * (b - 64)
|
297
|
-
pow -= 1
|
298
|
-
end
|
299
|
-
result
|
300
|
-
end
|
301
|
-
|
302
|
-
# Excel doesn't record types for some cells, only its display style, so
|
303
|
-
# we have to back out the type from that style.
|
304
|
-
#
|
305
|
-
# Some of these styles can be determined from a known set (see NumFmtMap),
|
306
|
-
# while others are 'custom' and we have to make a best guess.
|
307
|
-
#
|
308
|
-
# This is the array of types corresponding to the styles a spreadsheet
|
309
|
-
# uses, and includes both the known style types and the custom styles.
|
310
|
-
#
|
311
|
-
# Note that the xml sheet cells that use this don't reference the
|
312
|
-
# numFmtId, but instead the array index of a style in the stored list of
|
313
|
-
# only the styles used in the spreadsheet (which can be either known or
|
314
|
-
# custom). Hence this style types array, rather than a map of numFmtId to
|
315
|
-
# type.
|
316
|
-
def style_types
|
317
|
-
@style_types ||=
|
318
|
-
xml.styles.xpath('/styleSheet/cellXfs/xf').map {|xstyle|
|
319
|
-
style_type_by_num_fmt_id(num_fmt_id(xstyle))}
|
320
|
-
end
|
321
|
-
|
322
|
-
#returns the numFmtId value if it's available
|
323
|
-
def num_fmt_id(xstyle)
|
324
|
-
if xstyle.attributes['numFmtId']
|
325
|
-
xstyle.attributes['numFmtId'].value
|
326
|
-
else
|
327
|
-
nil
|
328
|
-
end
|
329
|
-
end
|
330
|
-
|
331
|
-
# Finds the type we think a style is; For example, fmtId 14 is a date
|
332
|
-
# style, so this would return :date.
|
333
|
-
#
|
334
|
-
# Note, custom styles usually (are supposed to?) have a numFmtId >= 164,
|
335
|
-
# but in practice can sometimes be simply out of the usual "Any Language"
|
336
|
-
# id range that goes up to 49. For example, I have seen a numFmtId of
|
337
|
-
# 59 specified as a date. In Thai, 59 is a number format, so this seems
|
338
|
-
# like a bad idea, but we try to be flexible and just go with it.
|
339
|
-
def style_type_by_num_fmt_id(id)
|
340
|
-
return nil if id.nil?
|
341
|
-
|
342
|
-
id = id.to_i
|
343
|
-
NumFmtMap[id] || custom_style_types[id]
|
344
|
-
end
|
345
|
-
|
346
|
-
# Map of (numFmtId >= 164) (custom styles) to our best guess at the type
|
347
|
-
# ex. {164 => :date_time}
|
348
|
-
def custom_style_types
|
349
|
-
@custom_style_types ||=
|
350
|
-
xml.styles.xpath('/styleSheet/numFmts/numFmt').
|
351
|
-
inject({}) do |acc, xstyle|
|
352
|
-
|
353
|
-
acc[xstyle.attributes['numFmtId'].value.to_i] =
|
354
|
-
determine_custom_style_type(xstyle.attributes['formatCode'].value)
|
355
|
-
|
356
|
-
acc
|
357
|
-
end
|
358
|
-
end
|
359
|
-
|
360
|
-
# This is the least deterministic part of reading xlsx files. Due to
|
361
|
-
# custom styles, you can't know for sure when a date is a date other than
|
362
|
-
# looking at its format and guessing. It's not impossible to guess right,
|
363
|
-
# though.
|
364
|
-
#
|
365
|
-
# http://stackoverflow.com/questions/4948998/determining-if-an-xlsx-cell-is-date-formatted-for-excel-2007-spreadsheets
|
366
|
-
def determine_custom_style_type(string)
|
367
|
-
return :float if string[0] == '_'
|
368
|
-
return :float if string[0] == ' 0'
|
369
|
-
|
370
|
-
# Looks for one of ymdhis outside of meta-stuff like [Red]
|
371
|
-
return :date_time if string =~ /(^|\])[^\[]*[ymdhis]/i
|
372
|
-
|
373
|
-
return :unsupported
|
374
|
-
end
|
375
|
-
|
376
|
-
##
|
377
|
-
# The heart of typecasting. The ruby type is determined either explicitly
|
378
|
-
# from the cell xml or implicitly from the cell style, and this
|
379
|
-
# method expects that work to have been done already. This, then,
|
380
|
-
# takes the type we determined it to be and casts the cell value
|
381
|
-
# to that type.
|
382
|
-
#
|
383
|
-
# types:
|
384
|
-
# - s: shared string (see #shared_string)
|
385
|
-
# - n: number (cast to a float)
|
386
|
-
# - b: boolean
|
387
|
-
# - str: string
|
388
|
-
# - inlineStr: string
|
389
|
-
# - ruby symbol: for when type has been determined by style
|
390
|
-
#
|
391
|
-
# options:
|
392
|
-
# - shared_strings: needed for 's' (shared string) type
|
393
|
-
def self.cast(value, type, style, options = {})
|
394
|
-
return nil if value.nil? || value.empty?
|
395
|
-
|
396
|
-
# Sometimes the type is dictated by the style alone
|
397
|
-
if type.nil? ||
|
398
|
-
(type == 'n' && [:date, :time, :date_time].include?(style))
|
399
|
-
type = style
|
400
|
-
end
|
401
|
-
|
402
|
-
casted = case type
|
403
|
-
|
404
|
-
##
|
405
|
-
# There are few built-in types
|
406
|
-
##
|
407
|
-
|
408
|
-
when 's' # shared string
|
409
|
-
options[:shared_strings][value.to_i]
|
410
|
-
when 'n' # number
|
411
|
-
value.to_f
|
412
|
-
when 'b'
|
413
|
-
value.to_i == 1
|
414
|
-
when 'str'
|
415
|
-
value
|
416
|
-
when 'inlineStr'
|
417
|
-
value
|
418
|
-
|
419
|
-
##
|
420
|
-
# Type can also be determined by a style,
|
421
|
-
# detected earlier and cast here by its standardized symbol
|
422
|
-
##
|
423
|
-
|
424
|
-
when :string, :unsupported
|
425
|
-
value
|
426
|
-
when :fixnum
|
427
|
-
value.to_i
|
428
|
-
when :float
|
429
|
-
value.to_f
|
430
|
-
when :percentage
|
431
|
-
value.to_f / 100
|
432
|
-
# the trickiest. note that all these formats can vary on
|
433
|
-
# whether they actually contain a date, time, or datetime.
|
434
|
-
when :date, :time, :date_time
|
435
|
-
value = Float(value)
|
436
|
-
days_since_date_system_start = value.to_i
|
437
|
-
fraction_of_24 = value - days_since_date_system_start
|
438
|
-
|
439
|
-
# http://stackoverflow.com/questions/10559767/how-to-convert-ms-excel-date-from-float-to-date-format-in-ruby
|
440
|
-
date = options.fetch(:base_date, DATE_SYSTEM_1900) + days_since_date_system_start
|
441
|
-
|
442
|
-
if fraction_of_24 > 0 # there is a time associated
|
443
|
-
seconds = (fraction_of_24 * 86400).round
|
444
|
-
return Time.utc(date.year, date.month, date.day) + seconds
|
445
|
-
else
|
446
|
-
return date
|
447
|
-
end
|
448
|
-
when :bignum
|
449
|
-
if defined?(BigDecimal)
|
450
|
-
BigDecimal.new(value)
|
451
|
-
else
|
452
|
-
value.to_f
|
453
|
-
end
|
454
|
-
|
455
|
-
##
|
456
|
-
# Beats me
|
457
|
-
##
|
458
|
-
|
459
|
-
else
|
460
|
-
value
|
461
|
-
end
|
462
|
-
|
463
|
-
if options[:url]
|
464
|
-
Hyperlink.new(options[:url], casted)
|
465
|
-
else
|
466
|
-
casted
|
467
|
-
end
|
468
|
-
end
|
469
|
-
|
470
|
-
## Returns the base_date from which to calculate dates.
|
471
|
-
# Defaults to 1900 (minus two days due to excel quirk), but use 1904 if
|
472
|
-
# it's set in the Workbook's workbookPr.
|
473
|
-
# http://msdn.microsoft.com/en-us/library/ff530155(v=office.12).aspx
|
474
|
-
def base_date
|
475
|
-
@base_date ||=
|
476
|
-
begin
|
477
|
-
return DATE_SYSTEM_1900 if xml.workbook == nil
|
478
|
-
xml.workbook.xpath("//workbook/workbookPr[@date1904]").each do |workbookPr|
|
479
|
-
return DATE_SYSTEM_1904 if workbookPr["date1904"] =~ /true|1/i
|
480
|
-
end
|
481
|
-
DATE_SYSTEM_1900
|
482
|
-
end
|
483
|
-
end
|
484
|
-
|
485
|
-
# Map of non-custom numFmtId to casting symbol
|
486
|
-
NumFmtMap = {
|
487
|
-
0 => :string, # General
|
488
|
-
1 => :fixnum, # 0
|
489
|
-
2 => :float, # 0.00
|
490
|
-
3 => :fixnum, # #,##0
|
491
|
-
4 => :float, # #,##0.00
|
492
|
-
5 => :unsupported, # $#,##0_);($#,##0)
|
493
|
-
6 => :unsupported, # $#,##0_);[Red]($#,##0)
|
494
|
-
7 => :unsupported, # $#,##0.00_);($#,##0.00)
|
495
|
-
8 => :unsupported, # $#,##0.00_);[Red]($#,##0.00)
|
496
|
-
9 => :percentage, # 0%
|
497
|
-
10 => :percentage, # 0.00%
|
498
|
-
11 => :bignum, # 0.00E+00
|
499
|
-
12 => :unsupported, # # ?/?
|
500
|
-
13 => :unsupported, # # ??/??
|
501
|
-
14 => :date, # mm-dd-yy
|
502
|
-
15 => :date, # d-mmm-yy
|
503
|
-
16 => :date, # d-mmm
|
504
|
-
17 => :date, # mmm-yy
|
505
|
-
18 => :time, # h:mm AM/PM
|
506
|
-
19 => :time, # h:mm:ss AM/PM
|
507
|
-
20 => :time, # h:mm
|
508
|
-
21 => :time, # h:mm:ss
|
509
|
-
22 => :date_time, # m/d/yy h:mm
|
510
|
-
37 => :unsupported, # #,##0 ;(#,##0)
|
511
|
-
38 => :unsupported, # #,##0 ;[Red](#,##0)
|
512
|
-
39 => :unsupported, # #,##0.00;(#,##0.00)
|
513
|
-
40 => :unsupported, # #,##0.00;[Red](#,##0.00)
|
514
|
-
45 => :time, # mm:ss
|
515
|
-
46 => :time, # [h]:mm:ss
|
516
|
-
47 => :time, # mmss.0
|
517
|
-
48 => :bignum, # ##0.0E+0
|
518
|
-
49 => :unsupported # @
|
519
|
-
}
|
520
|
-
|
521
|
-
# For performance reasons, excel uses an optional SpreadsheetML feature
|
522
|
-
# that puts all strings in a separate xml file, and then references
|
523
|
-
# them by their index in that file.
|
524
|
-
#
|
525
|
-
# http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
|
526
|
-
def shared_strings
|
527
|
-
@shared_strings ||= begin
|
528
|
-
if xml.shared_strings
|
529
|
-
xml.shared_strings.xpath('/sst/si').map do |xsst|
|
530
|
-
# a shared string can be a single value...
|
531
|
-
sst = xsst.at_xpath('t/text()')
|
532
|
-
sst = sst.text if sst
|
533
|
-
# ... or a composite of separately styled words/characters
|
534
|
-
sst ||= xsst.xpath('r/t/text()').map(&:text).join
|
535
|
-
end
|
536
|
-
else
|
537
|
-
[]
|
538
|
-
end
|
539
|
-
end
|
540
|
-
end
|
541
|
-
|
44
|
+
def open(file_path)
|
45
|
+
Document.new(file_path).tap(&:sheets)
|
542
46
|
end
|
543
|
-
|
47
|
+
alias parse open
|
544
48
|
end
|
545
49
|
end
|
data/test/date1904_test.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require_relative 'test_helper'
|
2
4
|
|
3
5
|
describe SimpleXlsxReader do
|
@@ -5,9 +7,8 @@ describe SimpleXlsxReader do
|
|
5
7
|
let(:subject) { SimpleXlsxReader::Document.new(date1904_file) }
|
6
8
|
|
7
9
|
it 'supports converting dates with the 1904 date system' do
|
8
|
-
subject.to_hash.must_equal(
|
9
|
-
|
10
|
-
|
10
|
+
_(subject.to_hash).must_equal(
|
11
|
+
'date1904' => [[Date.parse('2014-05-01')]]
|
12
|
+
)
|
11
13
|
end
|
12
|
-
|
13
14
|
end
|
data/test/datetime_test.rb
CHANGED
@@ -1,19 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require_relative 'test_helper'
|
2
4
|
|
3
5
|
describe SimpleXlsxReader do
|
4
|
-
let(:datetimes_file)
|
5
|
-
|
6
|
+
let(:datetimes_file) do
|
7
|
+
File.join(
|
8
|
+
File.dirname(__FILE__),
|
9
|
+
'datetimes.xlsx'
|
10
|
+
)
|
11
|
+
end
|
6
12
|
|
7
13
|
let(:subject) { SimpleXlsxReader::Document.new(datetimes_file) }
|
8
14
|
|
9
15
|
it 'converts date_times with the correct precision' do
|
10
|
-
subject.to_hash.must_equal(
|
11
|
-
|
12
|
-
[
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
16
|
+
_(subject.to_hash).must_equal(
|
17
|
+
'Datetimes' =>
|
18
|
+
[
|
19
|
+
[Time.parse('2013-08-19 18:29:59 UTC')],
|
20
|
+
[Time.parse('2013-08-19 18:30:00 UTC')],
|
21
|
+
[Time.parse('2013-08-19 18:30:01 UTC')],
|
22
|
+
[Time.parse('1899-12-30 00:30:00 UTC')]
|
23
|
+
]
|
24
|
+
)
|
17
25
|
end
|
18
|
-
|
19
26
|
end
|
data/test/gdocs_sheet_test.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require_relative 'test_helper'
|
2
4
|
require 'time'
|
3
5
|
|
@@ -6,10 +8,9 @@ describe SimpleXlsxReader do
|
|
6
8
|
let(:subject) { SimpleXlsxReader::Document.new(one_sheet_file) }
|
7
9
|
|
8
10
|
it 'able to load file from google docs' do
|
9
|
-
subject.to_hash.must_equal(
|
10
|
-
|
11
|
-
|
12
|
-
|
11
|
+
_(subject.to_hash).must_equal(
|
12
|
+
'List 1' => [['Empty gdocs list 1']],
|
13
|
+
'List 2' => [['Empty gdocs list 2']]
|
14
|
+
)
|
13
15
|
end
|
14
|
-
|
15
16
|
end
|
@@ -1,15 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require_relative 'test_helper'
|
2
4
|
|
3
5
|
describe SimpleXlsxReader do
|
4
|
-
let(:lower_case_shared_strings)
|
5
|
-
|
6
|
+
let(:lower_case_shared_strings) do
|
7
|
+
File.join(
|
8
|
+
File.dirname(__FILE__),
|
9
|
+
'lower_case_sharedstrings.xlsx'
|
10
|
+
)
|
11
|
+
end
|
6
12
|
|
7
13
|
let(:subject) { SimpleXlsxReader::Document.new(lower_case_shared_strings) }
|
8
14
|
|
9
|
-
|
10
15
|
describe '#to_hash' do
|
11
16
|
it 'should have the word Well in the first row' do
|
12
|
-
subject.sheets.first.rows[0].must_include('Well')
|
17
|
+
_(subject.sheets.first.rows.to_a[0]).must_include('Well')
|
13
18
|
end
|
14
19
|
end
|
15
20
|
end
|