simple_xlsx_reader 1.0.5 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +38 -0
- data/CHANGELOG.md +7 -0
- data/README.md +190 -64
- data/Rakefile +3 -1
- data/lib/simple_xlsx_reader/document.rb +147 -0
- data/lib/simple_xlsx_reader/hyperlink.rb +30 -0
- data/lib/simple_xlsx_reader/loader/shared_strings_parser.rb +46 -0
- data/lib/simple_xlsx_reader/loader/sheet_parser.rb +256 -0
- data/lib/simple_xlsx_reader/loader/style_types_parser.rb +115 -0
- data/lib/simple_xlsx_reader/loader/workbook_parser.rb +39 -0
- data/lib/simple_xlsx_reader/loader.rb +199 -0
- data/lib/simple_xlsx_reader/version.rb +3 -1
- data/lib/simple_xlsx_reader.rb +23 -519
- data/test/date1904_test.rb +5 -4
- data/test/datetime_test.rb +17 -10
- data/test/gdocs_sheet_test.rb +6 -5
- data/test/lower_case_sharedstrings_test.rb +9 -4
- data/test/performance_test.rb +85 -88
- data/test/shared_strings.xml +4 -0
- data/test/simple_xlsx_reader_test.rb +785 -375
- data/test/test_helper.rb +4 -1
- data/test/test_xlsx_builder.rb +104 -0
- metadata +16 -6
@@ -0,0 +1,256 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'forwardable'
|
4
|
+
|
5
|
+
module SimpleXlsxReader
|
6
|
+
class Loader
|
7
|
+
class SheetParser < Nokogiri::XML::SAX::Document
|
8
|
+
extend Forwardable
|
9
|
+
|
10
|
+
attr_accessor :xrels_file
|
11
|
+
attr_accessor :hyperlinks_by_cell
|
12
|
+
|
13
|
+
attr_reader :load_errors
|
14
|
+
|
15
|
+
def_delegators :@loader, :style_types, :shared_strings, :base_date
|
16
|
+
|
17
|
+
# Builds a parser for a single worksheet.
#
# file_io - IO-like stream of the sheet XML; #parse rewinds it and feeds it
#           to the SAX parser.
# loader  - object that #style_types, #shared_strings and #base_date are
#           delegated to (see the def_delegators declaration).
def initialize(file_io:, loader:)
  @file_io = file_io
  @loader = loader
end
|
21
|
+
|
22
|
+
# Stream-parses the worksheet XML; rows are handed to the given block from
# the SAX hooks below (see #end_element and #yield_row).
#
# headers - controls header handling in #end_element:
#           * falsy: every row is yielded as an Array
#           * true: the first completed row becomes the header list
#           * Hash: rows are tested with
#             #test_headers_hash_against_current_row until one matches
#           * callable: called with each row until it returns truthy; that
#             row becomes the header list
#           * any other truthy value: used as the header list directly
#           Once headers are known, rows are yielded as Hashes.
# block   - required; stored as @each_callback.
#
# Raises RuntimeError when called without a block.
def parse(headers: false, &block)
  raise 'parse called without a block; what should this do?'\
    unless block_given?

  # Reset per-parse state so the same parser can be run more than once.
  @headers = headers
  @each_callback = block
  @load_errors = {}
  @current_row_num = nil
  @last_seen_row_idx = 0
  @url = nil # silence warnings
  @function = nil # silence warnings
  @capture = nil # silence warnings
  @dimension = nil # silence warnings

  @file_io.rewind # in case we've already parsed this once

  # In this project this is only used for GUI-made hyperlinks (as opposed
  # to FUNCTION-based hyperlinks). Unfortunately they're needed to parse
  # the spreadsheet, and they come AFTER the sheet data. So, solution is
  # to just stream-parse the file twice, first for the hyperlinks at the
  # bottom of the file, then for the file itself. In the future it would
  # be clever to use grep to extract the xml into its own smaller file.
  if xrels_file&.grep(/hyperlink/)&.any?
    # grep consumed the rels stream; rewind so #xrels can read it again.
    xrels_file.rewind
    load_gui_hyperlinks # represented as hyperlinks_by_cell
    # The hyperlink pass read the sheet stream; rewind for the main pass.
    @file_io.rewind
  end

  Nokogiri::XML::SAX::Parser.new(self).parse(@file_io)
end
|
52
|
+
|
53
|
+
###
|
54
|
+
# SAX document hooks
|
55
|
+
|
56
|
+
# SAX hook: called when an element opens. Records per-row and per-cell
# state that #characters and #end_element rely on.
#
# name  - element name.
# attrs - array of [attribute_name, value] pairs.
def start_element(name, attrs = [])
  case name
  # Takes the value of the LAST attribute on <dimension> as the sheet range.
  # NOTE(review): presumably this is always the 'ref' attribute — confirm.
  when 'dimension' then @dimension = attrs.last.last
  when 'row'
    # 'r' is the row number; nil when the attribute is absent.
    @current_row_num = attrs.find {|(k, v)| k == 'r'}&.last&.to_i
    # Pre-size the row so cells missing from the XML stay nil.
    @current_row = Array.new(column_length)
  when 'c'
    attrs = attrs.inject({}) {|acc, (k, v)| acc[k] = v; acc}
    @cell_name = attrs['r']
    @type = attrs['t']
    # 's' is an index into the loader's style_types array.
    @style = attrs['s'] && style_types[attrs['s'].to_i]
  when 'f' then @function = true
  # Text inside <v> and <t> is cell content; see #characters.
  when 'v', 't' then @capture = true
  end
end
|
71
|
+
|
72
|
+
# SAX hook: called with text content. Inside an <f> element it looks for a
# HYPERLINK("...") url; inside <v>/<t> it casts the text and stores it in
# the current row at the cell's column index.
#
# Raises CellLoadError when casting fails and
# SimpleXlsxReader.configuration.catch_cell_load_errors is falsy; otherwise
# the failure message is recorded in @load_errors under [row_idx, col_idx]
# and the stripped raw text is stored as the cell value.
#
# NOTE(review): each call assigns the cell afresh, so if the SAX parser
# delivers one text node across several calls only the last piece would be
# kept — confirm whether that can happen here.
def characters(string)
  if @function
    # the only "function" we support is a hyperlink
    @url = string.slice(/HYPERLINK\("(.*?)"/, 1)
  end

  return unless @capture

  @current_row[cell_idx] =
    begin
      SimpleXlsxReader::Loader.cast(
        string.strip, @type, @style,
        # A formula-based url wins over a GUI-made hyperlink for this cell.
        url: @url || hyperlinks_by_cell&.[](@cell_name),
        shared_strings: shared_strings,
        base_date: base_date
      )
    rescue StandardError => e
      # Convert the cell name (ex. 'B3') into zero-based indexes for the
      # error report.
      column, row = @cell_name.match(/([A-Z]+)([0-9]+)/).captures
      col_idx = column_letter_to_number(column) - 1
      row_idx = row.to_i - 1

      if !SimpleXlsxReader.configuration.catch_cell_load_errors
        error = CellLoadError.new(
          "Row #{row_idx}, Col #{col_idx}: #{e.message}"
        )
        error.set_backtrace(e.backtrace)
        raise error
      else
        @load_errors[[row_idx, col_idx]] = e.message

        # Fall back to the uncast text as the cell value.
        string.strip
      end
    end
end
|
106
|
+
|
107
|
+
# SAX hook: called when an element closes. On </row> it either resolves the
# header row or yields the completed row (preceded by any skipped empty
# rows); on the other elements it clears the flags set in #start_element.
#
# name - element name.
def end_element(name)
  case name
  when 'row'
    if @headers == true # ya a little funky
      # First completed row becomes the header list; it is not yielded.
      @headers = @current_row
    elsif @headers.is_a?(Hash)
      # Still searching: replaces @headers with this row if it matches.
      test_headers_hash_against_current_row
      # in case there were empty rows before finding the header
      @last_seen_row_idx = @current_row_num - 1
    elsif @headers.respond_to?(:call)
      # Still searching: the callable decides whether this is the header row.
      @headers = @current_row if @headers.call(@current_row)
      # in case there were empty rows before finding the header
      @last_seen_row_idx = @current_row_num - 1
    elsif @headers
      # Headers are known: yield rows as hashes.
      possibly_yield_empty_rows(headers: true)
      yield_row(@current_row, headers: true)
    else
      possibly_yield_empty_rows(headers: false)
      yield_row(@current_row, headers: false)
    end

    @last_seen_row_idx += 1

    # Note that excel writes a '/worksheet/dimension' node we can get
    # this from, but some libs (ex. simple_xlsx_writer) don't record it.
    # In that case, we assume the data is of uniform column length and
    # store the column name of the last header row we see. Obviously this
    # isn't the most robust strategy, but it likely fits 99% of use cases
    # considering it's not a problem with actual excel docs.
    @dimension = "A1:#{@cell_name}" if @dimension.nil?
  when 'v', 't' then @capture = false
  when 'f' then @function = false
  # A formula url applies to one cell only.
  when 'c' then @url = nil
  end
end
|
142
|
+
|
143
|
+
###
|
144
|
+
# /End SAX hooks
|
145
|
+
|
146
|
+
# Tests whether the current row is the header row described by the
# +@headers+ hash, which maps a desired key to what to look for in a cell:
# a String is compared for equality, anything else is passed to the cell's
# #match? (ex. a Regexp).
#
# Every matching cell in +@current_row+ is replaced by its key. If at least
# one cell matched, +@headers+ is replaced by the rewritten row, which ends
# the search (see #end_element).
#
# Returns the new header row, or nil when nothing matched.
def test_headers_hash_against_current_row
  found = false

  @current_row.each_with_index do |cell, cell_idx|
    @headers.each_pair do |key, search|
      matched =
        if search.is_a?(String)
          cell == search
        else
          # Rows above the header may hold cells already cast to numbers or
          # dates, which have no #match?; they cannot match a pattern, so
          # treat them as a miss instead of raising NoMethodError.
          cell.respond_to?(:match?) && cell.match?(search)
        end
      next unless matched

      found = true
      @current_row[cell_idx] = key
    end
  end

  @headers = @current_row if found
end
|
160
|
+
|
161
|
+
# Yields an all-nil row for every row number skipped between the last row
# seen and the current one, advancing @last_seen_row_idx as it goes.
# Does nothing when the current row has no row number.
#
# headers - passed through to #yield_row.
def possibly_yield_empty_rows(headers:)
  until @current_row_num.nil? || @current_row_num <= @last_seen_row_idx + 1
    @last_seen_row_idx += 1
    blank_row = Array.new(column_length)
    yield_row(blank_row, headers: headers)
  end
end
|
167
|
+
|
168
|
+
# Hands one row to the callback given to #parse.
#
# row     - Array of cell values.
# headers - when truthy, the row is paired up with @headers and passed as a
#           Hash; otherwise the Array is passed as-is.
#
# Returns whatever the callback returns.
def yield_row(row, headers:)
  payload = headers ? @headers.zip(row).to_h : row
  @each_callback.call(payload)
end
|
175
|
+
|
176
|
+
# This sax-parses the whole sheet, just to extract hyperlink refs at the end.
|
177
|
+
# Runs HyperlinksParser over the sheet stream and stores the resulting
# cell => target map in hyperlinks_by_cell.
def load_gui_hyperlinks
  links = HyperlinksParser.parse(@file_io, xrels: xrels)
  self.hyperlinks_by_cell = links
end
|
181
|
+
|
182
|
+
# SAX handler that scans a sheet for <hyperlink> elements and resolves each
# one's relationship id against the parsed rels document, producing a map
# of cell reference => link target.
class HyperlinksParser < Nokogiri::XML::SAX::Document
  # file_io - IO-like stream of the sheet XML.
  # xrels   - parsed relationships document; must respond to #at_xpath.
  def initialize(file_io, xrels:)
    @file_io = file_io
    @xrels = xrels
  end

  # Convenience wrapper: builds a parser and returns its hyperlink map.
  def self.parse(file_io, xrels:)
    new(file_io, xrels: xrels).parse
  end

  # Runs the SAX parse and returns the Hash of cell ref => target.
  def parse
    @hyperlinks_by_cell = {}
    Nokogiri::XML::SAX::Parser.new(self).parse(@file_io)
    @hyperlinks_by_cell
  end

  # SAX hook: records the target of each <hyperlink> element.
  #
  # name  - element name.
  # attrs - array of [attribute_name, value] pairs.
  def start_element(name, attrs = [])
    return unless name == 'hyperlink'

    attrs = attrs.to_h
    id = attrs['id'] || attrs['r:id']

    # A hyperlink without a relationship id, or whose id is missing from
    # the rels document, has no target to resolve; skip it rather than
    # calling #attr on nil.
    target = id && @xrels.at_xpath(%(//*[@Id="#{id}"]))&.attr('Target')
    @hyperlinks_by_cell[attrs['ref']] = target if target
  end
end
|
209
|
+
|
210
|
+
# Lazily parses the relationships file into an XML document.
# Returns nil when no relationships file was attached.
def xrels
  return unless xrels_file

  @xrels ||= Nokogiri::XML(xrels_file.read)
end
|
213
|
+
|
214
|
+
# Number of columns in the sheet, taken from the last column letter of the
# dimension. Returns 0 while the dimension is still unknown; once computed
# the value is memoized.
def column_length
  if @dimension
    @column_length ||= column_letter_to_number(last_cell_letter)
  else
    0
  end
end
|
219
|
+
|
220
|
+
# Zero-based column index of the current cell, derived from the letters in
# its name (ex. 'B3' => 1).
def cell_idx
  column_letters = @cell_name[/[A-Z]+/]
  column_letter_to_number(column_letters) - 1
end
|
223
|
+
|
224
|
+
##
|
225
|
+
# Returns the last column name, ex. 'E'
|
226
|
+
# Column letters of the last cell in the dimension (ex. 'E' for 'A1:E5').
# Falls back to 'A' when the dimension has no ':<letters>' part, and returns
# nil when the dimension is unknown.
def last_cell_letter
  return unless @dimension

  @dimension[/:([A-Z]+)/, 1] || 'A'
end
|
231
|
+
|
232
|
+
# formula fits an exponential factorial function of the form:
|
233
|
+
# 'A' = 1
|
234
|
+
# 'B' = 2
|
235
|
+
# 'Z' = 26
|
236
|
+
# 'AA' = 26 * 1 + 1
|
237
|
+
# 'AZ' = 26 * 1 + 26
|
238
|
+
# 'BA' = 26 * 2 + 1
|
239
|
+
# 'ZA' = 26 * 26 + 1
|
240
|
+
# 'ZZ' = 26 * 26 + 26
|
241
|
+
# 'AAA' = 26 * 26 * 1 + 26 * 1 + 1
|
242
|
+
# 'AAZ' = 26 * 26 * 1 + 26 * 1 + 26
|
243
|
+
# 'ABA' = 26 * 26 * 1 + 26 * 2 + 1
|
244
|
+
# 'BZA' = 26 * 26 * 2 + 26 * 26 + 1
|
245
|
+
# Converts column letters to a one-based column number by reading them as
# base-26 digits where 'A' is 1 and 'Z' is 26 (ex. 'AA' => 27).
# An empty string yields 0.
def column_letter_to_number(column_letter)
  # Horner's scheme: shift the running total one base-26 place per letter.
  # Byte 65 is 'A', hence the offset of 64.
  column_letter.each_byte.reduce(0) do |total, byte|
    total * 26 + (byte - 64)
  end
end
|
254
|
+
end
|
255
|
+
end
|
256
|
+
end
|
@@ -0,0 +1,115 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SimpleXlsxReader
|
4
|
+
class Loader
|
5
|
+
# Parses a styles XML stream and works out, for each cell style the
# spreadsheet declares, which type symbol cells using that style should be
# cast to.
StyleTypesParser = Struct.new(:file_io) do
  # Parses +file_io+ and returns the style types array (see #style_types).
  def self.parse(file_io)
    new(file_io).tap(&:parse).style_types
  end

  # Map of non-custom numFmtId to casting symbol
  NumFmtMap = {
    0 => :string, # General
    1 => :fixnum, # 0
    2 => :float, # 0.00
    3 => :fixnum, # #,##0
    4 => :float, # #,##0.00
    5 => :unsupported, # $#,##0_);($#,##0)
    6 => :unsupported, # $#,##0_);[Red]($#,##0)
    7 => :unsupported, # $#,##0.00_);($#,##0.00)
    8 => :unsupported, # $#,##0.00_);[Red]($#,##0.00)
    9 => :percentage, # 0%
    10 => :percentage, # 0.00%
    11 => :bignum, # 0.00E+00
    12 => :unsupported, # # ?/?
    13 => :unsupported, # # ??/??
    14 => :date, # mm-dd-yy
    15 => :date, # d-mmm-yy
    16 => :date, # d-mmm
    17 => :date, # mmm-yy
    18 => :time, # h:mm AM/PM
    19 => :time, # h:mm:ss AM/PM
    20 => :time, # h:mm
    21 => :time, # h:mm:ss
    22 => :date_time, # m/d/yy h:mm
    37 => :unsupported, # #,##0 ;(#,##0)
    38 => :unsupported, # #,##0 ;[Red](#,##0)
    39 => :unsupported, # #,##0.00;(#,##0.00)
    40 => :unsupported, # #,##0.00;[Red](#,##0.00)
    45 => :time, # mm:ss
    46 => :time, # [h]:mm:ss
    47 => :time, # mmss.0
    48 => :bignum, # ##0.0E+0
    49 => :unsupported # @
  }.freeze

  # Reads the whole stream into an XML document with namespaces removed,
  # so the xpaths below can use bare element names.
  def parse
    @xml = Nokogiri::XML(file_io.read).remove_namespaces!
  end

  # Excel doesn't record types for some cells, only its display style, so
  # we have to back out the type from that style.
  #
  # Some of these styles can be determined from a known set (see NumFmtMap),
  # while others are 'custom' and we have to make a best guess.
  #
  # This is the array of types corresponding to the styles a spreadsheet
  # uses, and includes both the known style types and the custom styles.
  #
  # Note that the xml sheet cells that use this don't reference the
  # numFmtId, but instead the array index of a style in the stored list of
  # only the styles used in the spreadsheet (which can be either known or
  # custom). Hence this style types array, rather than a map of numFmtId to
  # type.
  def style_types
    @xml.xpath('/styleSheet/cellXfs/xf').map do |xstyle|
      style_type_by_num_fmt_id(
        xstyle.attributes['numFmtId']&.value
      )
    end
  end

  # Finds the type we think a style is; For example, fmtId 14 is a date
  # style, so this would return :date.
  #
  # Note, custom styles usually (are supposed to?) have a numFmtId >= 164,
  # but in practice can sometimes be simply out of the usual "Any Language"
  # id range that goes up to 49. For example, I have seen a numFmtId of
  # 59 specified as a date. In Thai, 59 is a number format, so this seems
  # like a bad idea, but we try to be flexible and just go with it.
  #
  # id - numFmtId as a String (or nil).
  #
  # Returns the type symbol, or nil when +id+ is nil or unknown.
  def style_type_by_num_fmt_id(id)
    return nil if id.nil?

    id = id.to_i
    NumFmtMap[id] || custom_style_types[id]
  end

  # Map of (numFmtId >= 164) (custom styles) to our best guess at the type
  # ex. {164 => :date_time}
  def custom_style_types
    @custom_style_types ||=
      @xml.xpath('/styleSheet/numFmts/numFmt')
          .each_with_object({}) do |xstyle, acc|
        acc[xstyle.attributes['numFmtId'].value.to_i] =
          determine_custom_style_type(xstyle.attributes['formatCode'].value)
      end
  end

  # This is the least deterministic part of reading xlsx files. Due to
  # custom styles, you can't know for sure when a date is a date other than
  # looking at its format and guessing. It's not impossible to guess right,
  # though.
  #
  # http://stackoverflow.com/questions/4948998/determining-if-an-xlsx-cell-is-date-formatted-for-excel-2007-spreadsheets
  #
  # string - the custom format code.
  #
  # Returns :float, :date_time or :unsupported.
  def determine_custom_style_type(string)
    # Format codes that open with '_' or ' 0' are treated as floats. The
    # ' 0' case needs a prefix test: a single character (string[0]) can
    # never equal the two-character string ' 0'.
    return :float if string.start_with?('_', ' 0')

    # Looks for one of ymdhis outside of meta-stuff like [Red]
    return :date_time if string.match?(/(^|\])[^\[]*[ymdhis]/i)

    :unsupported
  end
end
|
114
|
+
end
|
115
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SimpleXlsxReader
|
4
|
+
class Loader
|
5
|
+
# Parses a workbook XML stream for the sheet table of contents and the
# date system the workbook uses.
WorkbookParser = Struct.new(:file_io) do
  # Parses +file_io+ and returns the pair [sheet_toc, base_date].
  def self.parse(file_io)
    instance = new(file_io)
    instance.parse
    [instance.sheet_toc, instance.base_date]
  end

  # Reads the whole stream into an XML document with namespaces removed.
  def parse
    document = Nokogiri::XML(file_io.read)
    @xml = document.remove_namespaces!
  end

  # Table of contents for the sheets, ex. {'Authors' => 0, ...}
  def sheet_toc
    toc = {}
    @xml.xpath('/workbook/sheets/sheet').each do |sheet|
      # sheetId is one-based in the file; keep things 0-indexed
      toc[sheet.attributes['name'].value] =
        sheet.attributes['sheetId'].value.to_i - 1
    end
    toc
  end

  ## Returns the base_date from which to calculate dates.
  # Defaults to 1900 (minus two days due to excel quirk), but use 1904 if
  # it's set in the Workbook's workbookPr.
  # http://msdn.microsoft.com/en-us/library/ff530155(v=office.12).aspx
  def base_date
    return DATE_SYSTEM_1900 if @xml.nil?

    flagged_nodes = @xml.xpath('//workbook/workbookPr[@date1904]')
    uses_1904 = flagged_nodes.any? { |node| node['date1904'] =~ /true|1/i }

    uses_1904 ? DATE_SYSTEM_1904 : DATE_SYSTEM_1900
  end
end
|
38
|
+
end
|
39
|
+
end
|
@@ -0,0 +1,199 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SimpleXlsxReader
|
4
|
+
class Loader < Struct.new(:file_path)
|
5
|
+
attr_accessor :shared_strings, :sheet_parsers, :sheet_toc, :style_types, :base_date
|
6
|
+
|
7
|
+
# Reads the xlsx archive (filling in this loader's sheet_toc, style_types,
# shared_strings, base_date and sheet_parsers through ZipReader#read) and
# returns one Document::Sheet per entry in the table of contents.
def init_sheets
  reader = ZipReader.new(file_path: file_path, loader: self)
  reader.read

  # The sheet number stored in sheet_toc is *not* the index into
  # sheet_parsers, so pair each name with its position in the table instead.
  sheet_toc.each_with_index.map do |(sheet_name, _sheet_number), position|
    SimpleXlsxReader::Document::Sheet.new(
      name: sheet_name,
      sheet_parser: sheet_parsers[position]
    )
  end
end
|
21
|
+
|
22
|
+
# Opens the xlsx zip archive at +file_path+ and populates +loader+ with the
# parsed workbook metadata and one SheetParser per worksheet entry.
ZipReader = Struct.new(:file_path, :loader, keyword_init: true) do
  attr_reader :zip

  # Accepts the struct's keyword members, then opens the archive right away.
  def initialize(*args)
    super
    @zip = SimpleXlsxReader::Zip.open(file_path)
  end

  # Parses the workbook, styles and (if present) shared strings entries into
  # the loader, then registers a SheetParser for each worksheet entry found.
  def read
    entry_at('xl/workbook.xml') do |file_io|
      loader.sheet_toc, loader.base_date = *WorkbookParser.parse(file_io)
    end

    entry_at('xl/styles.xml') do |file_io|
      loader.style_types = StyleTypesParser.parse(file_io)
    end

    # optional feature used by excel,
    # but not often used by xlsx generation libraries
    if (ss_entry = entry_at('xl/sharedStrings.xml'))
      ss_entry.get_input_stream do |file|
        loader.shared_strings = SharedStringsParser.parse(file)
      end
    else
      loader.shared_strings = []
    end

    loader.sheet_parsers = []

    # Sometimes there's a zero-index sheet.xml, ex.
    # Google Docs creates:
    # xl/worksheets/sheet.xml
    # xl/worksheets/sheet1.xml
    # xl/worksheets/sheet2.xml
    # While Excel creates:
    # xl/worksheets/sheet1.xml
    # xl/worksheets/sheet2.xml
    add_sheet_parser_at_index(nil)

    # Keep adding sheetN.xml parsers until the next numbered entry is missing.
    i = 1
    while(add_sheet_parser_at_index(i)) do
      i += 1
    end
  end

  # Looks up a zip entry by path, trying separator and case variants.
  #
  # With a block, opens the entry's input stream and passes it to the block;
  # without one, returns the entry (nil when not found).
  #
  # NOTE(review): with a block and no matching entry, +entry+ is nil and the
  # get_input_stream call raises NoMethodError — confirm whether a missing
  # workbook.xml/styles.xml should produce a clearer error.
  def entry_at(path, &block)
    # Older and newer (post-mid-2021) RubyZip normalizes pathnames,
    # but unfortunately there is a time in between where it doesn't.
    # Rather than require a specific version, let's just be flexible.
    entry =
      zip.find_entry(path) || # *nix-generated
      zip.find_entry(path.tr('/', '\\')) || # Windows-generated
      zip.find_entry(path.downcase) || # Sometimes it's lowercase
      zip.find_entry(path.tr('/', '\\').downcase) # Sometimes it's lowercase

    if block
      entry.get_input_stream(&block)
    else
      entry
    end
  end

  # Registers a SheetParser for xl/worksheets/sheet<i>.xml if that entry
  # exists, attaching the sheet's relationships stream when there is one.
  #
  # i - sheet number, or nil for the un-numbered 'sheet.xml'.
  #
  # Returns the loader's sheet_parsers array (truthy) when a parser was
  # added, nil when the entry does not exist.
  def add_sheet_parser_at_index(i)
    sheet_file_name = "xl/worksheets/sheet#{i}.xml"
    return unless (entry = entry_at(sheet_file_name))

    parser =
      SheetParser.new(
        # Opened without a block: the parser keeps the stream for later use.
        file_io: entry.get_input_stream,
        loader: loader
      )

    relationship_file_name = "xl/worksheets/_rels/sheet#{i}.xml.rels"
    if (rel = entry_at(relationship_file_name))
      parser.xrels_file = rel.get_input_stream
    end

    loader.sheet_parsers << parser
  end
end
|
102
|
+
|
103
|
+
##
|
104
|
+
# The heart of typecasting. The ruby type is determined either explicitly
|
105
|
+
# from the cell xml or implicitly from the cell style, and this
|
106
|
+
# method expects that work to have been done already. This, then,
|
107
|
+
# takes the type we determined it to be and casts the cell value
|
108
|
+
# to that type.
|
109
|
+
#
|
110
|
+
# types:
|
111
|
+
# - s: shared string (see #shared_string)
|
112
|
+
# - n: number (cast to a float)
|
113
|
+
# - b: boolean
|
114
|
+
# - str: string
|
115
|
+
# - inlineStr: string
|
116
|
+
# - ruby symbol: for when type has been determined by style
|
117
|
+
#
|
118
|
+
# options:
|
119
|
+
# - shared_strings: needed for 's' (shared string) type
|
120
|
+
# Casts a raw cell string to a ruby value.
#
# value   - raw cell text; nil or empty returns nil.
# type    - the cell's xml type string ('s', 'n', 'b', 'str', 'inlineStr')
#           or nil.
# style   - type symbol derived from the cell's style, or nil.
# options - :shared_strings (indexable, used for type 's'),
#           :base_date (defaults to DATE_SYSTEM_1900),
#           :url (when set, the result is wrapped in a Hyperlink).
#
# Returns the casted value, or a Hyperlink wrapping it when :url is given.
# Date/time values that fail Float() raise from here.
def self.cast(value, type, style, options = {})
  return nil if value.nil? || value.empty?

  # Sometimes the type is dictated by the style alone
  if type.nil? ||
     (type == 'n' && %i[date time date_time].include?(style))
    type = style
  end

  casted =
    case type

    ##
    # There are few built-in types
    ##

    when 's' # shared string
      # The cell holds an index into the shared strings table.
      options[:shared_strings][value.to_i]
    when 'n' # number
      value.to_f
    when 'b'
      value.to_i == 1
    when 'str'
      value
    when 'inlineStr'
      value

    ##
    # Type can also be determined by a style,
    # detected earlier and cast here by its standardized symbol
    ##

    when :string, :unsupported
      value
    when :fixnum
      value.to_i
    when :float
      value.to_f
    when :percentage
      value.to_f / 100
    # the trickiest. note that all these formats can vary on
    # whether they actually contain a date, time, or datetime.
    when :date, :time, :date_time
      # Float() (unlike to_f) raises on non-numeric text.
      value = Float(value)
      # Whole part = days since the base date; fraction = time of day.
      days_since_date_system_start = value.to_i
      fraction_of_24 = value - days_since_date_system_start

      # http://stackoverflow.com/questions/10559767/how-to-convert-ms-excel-date-from-float-to-date-format-in-ruby
      date = options.fetch(:base_date, DATE_SYSTEM_1900) + days_since_date_system_start

      # NOTE(review): both branches return straight out of the method, so
      # the Hyperlink wrapping at the bottom is skipped for date/time cells
      # — confirm this is intended.
      if fraction_of_24 > 0 # there is a time associated
        seconds = (fraction_of_24 * 86_400).round
        return Time.utc(date.year, date.month, date.day) + seconds
      else
        return date
      end
    when :bignum
      # Use BigDecimal only if it has been loaded.
      if defined?(BigDecimal)
        BigDecimal(value)
      else
        value.to_f
      end

    ##
    # Beats me
    ##

    else
      value
    end

  if options[:url]
    Hyperlink.new(options[:url], casted)
  else
    casted
  end
end
|
197
|
+
end
|
198
|
+
end
|
199
|
+
|