simple_xlsx_reader 1.0.5 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +38 -0
- data/CHANGELOG.md +7 -0
- data/README.md +190 -64
- data/Rakefile +3 -1
- data/lib/simple_xlsx_reader/document.rb +147 -0
- data/lib/simple_xlsx_reader/hyperlink.rb +30 -0
- data/lib/simple_xlsx_reader/loader/shared_strings_parser.rb +46 -0
- data/lib/simple_xlsx_reader/loader/sheet_parser.rb +256 -0
- data/lib/simple_xlsx_reader/loader/style_types_parser.rb +115 -0
- data/lib/simple_xlsx_reader/loader/workbook_parser.rb +39 -0
- data/lib/simple_xlsx_reader/loader.rb +199 -0
- data/lib/simple_xlsx_reader/version.rb +3 -1
- data/lib/simple_xlsx_reader.rb +23 -519
- data/test/date1904_test.rb +5 -4
- data/test/datetime_test.rb +17 -10
- data/test/gdocs_sheet_test.rb +6 -5
- data/test/lower_case_sharedstrings_test.rb +9 -4
- data/test/performance_test.rb +85 -88
- data/test/shared_strings.xml +4 -0
- data/test/simple_xlsx_reader_test.rb +785 -375
- data/test/test_helper.rb +4 -1
- data/test/test_xlsx_builder.rb +104 -0
- metadata +16 -6
@@ -0,0 +1,256 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'forwardable'
|
4
|
+
|
5
|
+
module SimpleXlsxReader
|
6
|
+
class Loader
|
7
|
+
class SheetParser < Nokogiri::XML::SAX::Document
|
8
|
+
extend Forwardable
|
9
|
+
|
10
|
+
attr_accessor :xrels_file
|
11
|
+
attr_accessor :hyperlinks_by_cell
|
12
|
+
|
13
|
+
attr_reader :load_errors
|
14
|
+
|
15
|
+
def_delegators :@loader, :style_types, :shared_strings, :base_date
|
16
|
+
|
17
|
+
# Keeps hold of the sheet's XML stream and of the loader that the
# style_types / shared_strings / base_date delegators forward to.
def initialize(file_io:, loader:)
  @file_io, @loader = file_io, loader
end
|
21
|
+
|
22
|
+
# Stream-parses the sheet and hands every row to the given block.
#
# headers - false (default) yields rows as-is; other values switch on the
#           header handling performed in end_element.
#
# Raises RuntimeError when called without a block.
def parse(headers: false, &block)
  unless block_given?
    raise 'parse called without a block; what should this do?'
  end

  @headers = headers
  @each_callback = block
  @load_errors = {}
  @current_row_num = nil
  @last_seen_row_idx = 0
  # Pre-set the remaining per-parse state; this only silences warnings.
  @url = @function = @capture = @dimension = nil

  @file_io.rewind # the stream may have been consumed by an earlier parse

  # GUI-made hyperlinks (as opposed to FUNCTION-based ones) are needed while
  # reading cells, yet they are stored AFTER the sheet data. So the file is
  # stream-parsed twice: first for the hyperlinks at the bottom, then for
  # the rows themselves.
  gui_links_present = xrels_file&.grep(/hyperlink/)&.any?
  if gui_links_present
    xrels_file.rewind
    load_gui_hyperlinks # result is stored in hyperlinks_by_cell
    @file_io.rewind
  end

  Nokogiri::XML::SAX::Parser.new(self).parse(@file_io)
end
|
52
|
+
|
53
|
+
###
|
54
|
+
# SAX document hooks
|
55
|
+
|
56
|
+
# SAX hook: records per-element state as each opening tag streams past.
#
# name  - element name; only dimension, row, c, f, v and t are acted upon
# attrs - Array of [attribute_name, value] pairs
def start_element(name, attrs = [])
  case name
  when 'dimension'
    # Look the range up by name. Taking "the last attribute" picks the wrong
    # value when 'ref' is not last and raises on an empty attribute list.
    @dimension = attrs.assoc('ref')&.last
  when 'row'
    @current_row_num = attrs.assoc('r')&.last&.to_i
    @current_row = Array.new(column_length)
  when 'c'
    cell_attrs = attrs.to_h
    @cell_name = cell_attrs['r']
    @type = cell_attrs['t']
    # 's' is an index into the style types list
    @style = cell_attrs['s'] && style_types[cell_attrs['s'].to_i]
  when 'f' then @function = true
  when 'v', 't' then @capture = true
  end
end
|
71
|
+
|
72
|
+
# SAX hook: receives element text. While inside a formula the text is
# scanned for a HYPERLINK url; while capturing, the text is cast and stored
# in the current row at the current cell's column.
def characters(string)
  # the only "function" we support is a hyperlink
  @url = string.slice(/HYPERLINK\("(.*?)"/, 1) if @function

  return unless @capture

  @current_row[cell_idx] =
    begin
      SimpleXlsxReader::Loader.cast(
        string.strip, @type, @style,
        url: @url || hyperlinks_by_cell&.[](@cell_name),
        shared_strings: shared_strings,
        base_date: base_date
      )
    rescue StandardError => e
      # Report the failure with zero-based row/column coordinates.
      column, row = @cell_name.match(/([A-Z]+)([0-9]+)/).captures
      col_idx = column_letter_to_number(column) - 1
      row_idx = row.to_i - 1

      if SimpleXlsxReader.configuration.catch_cell_load_errors
        @load_errors[[row_idx, col_idx]] = e.message

        string.strip # fall back to the raw text as the cell value
      else
        error = CellLoadError.new(
          "Row #{row_idx}, Col #{col_idx}: #{e.message}"
        )
        error.set_backtrace(e.backtrace)
        raise error
      end
    end
end
|
106
|
+
|
107
|
+
# SAX hook: finishes the element that just closed.
#
# On </row> the row is either consumed as (or tested for being) the header
# row, or yielded to the caller, depending on the current @headers value:
#   true      - this row becomes the header row
#   Hash      - the row is tested against the hash (see
#               test_headers_hash_against_current_row)
#   callable  - the row becomes the header row when the callable returns truthy
#   otherwise - truthy @headers yields header-keyed rows, falsy yields plain rows
def end_element(name)
  case name
  when 'row'
    if @headers == true # ya a little funky
      @headers = @current_row
      # Rows skipped before the header must not be emitted as blank data
      # rows afterwards, so resync here just like the branches below.
      @last_seen_row_idx = @current_row_num - 1 if @current_row_num
    elsif @headers.is_a?(Hash)
      test_headers_hash_against_current_row
      # in case there were empty rows before finding the header
      # (guarded: a row without an 'r' attribute leaves @current_row_num nil)
      @last_seen_row_idx = @current_row_num - 1 if @current_row_num
    elsif @headers.respond_to?(:call)
      @headers = @current_row if @headers.call(@current_row)
      # in case there were empty rows before finding the header
      @last_seen_row_idx = @current_row_num - 1 if @current_row_num
    elsif @headers
      possibly_yield_empty_rows(headers: true)
      yield_row(@current_row, headers: true)
    else
      possibly_yield_empty_rows(headers: false)
      yield_row(@current_row, headers: false)
    end

    @last_seen_row_idx += 1

    # Excel writes a '/worksheet/dimension' node we can read the width
    # from, but some libs (ex. simple_xlsx_writer) don't record it. In that
    # case we assume uniform column length and use the name of the last cell
    # of the first finished row. Not the most robust strategy, but it is not
    # a problem with actual excel docs.
    @dimension = "A1:#{@cell_name}" if @dimension.nil?
  when 'v', 't' then @capture = false
  when 'f' then @function = false
  when 'c' then @url = nil
  end
end
|
142
|
+
|
143
|
+
###
|
144
|
+
# /End SAX hooks
|
145
|
+
|
146
|
+
# Checks the current row against the @headers hash ({key => String|pattern}).
# Every cell that matches a search value is replaced in the row by that
# value's key; when at least one cell matched, the (rewritten) row becomes
# the header row.
def test_headers_hash_against_current_row
  found = false

  @current_row.each_with_index do |cell, cell_idx|
    @headers.each_pair do |key, search|
      matched =
        if search.is_a?(String)
          cell == search
        else
          # Rows scanned before the header may hold nil, numbers or dates,
          # none of which respond to match?; treat them as a non-match
          # instead of raising NoMethodError.
          cell.respond_to?(:match?) && cell.match?(search)
        end
      next unless matched

      found = true
      @current_row[cell_idx] = key
    end
  end

  @headers = @current_row if found
end
|
160
|
+
|
161
|
+
# Yields one blank row for every row number skipped between the last row
# seen and the current row, advancing @last_seen_row_idx as it goes.
def possibly_yield_empty_rows(headers:)
  return unless @current_row_num

  until @last_seen_row_idx + 1 >= @current_row_num
    @last_seen_row_idx += 1
    blank_row = Array.new(column_length)
    yield_row(blank_row, headers: headers)
  end
end
|
167
|
+
|
168
|
+
# Passes a row to the parse callback: as a Hash keyed by the header row when
# headers is truthy, otherwise unchanged.
def yield_row(row, headers:)
  payload = headers ? @headers.zip(row).to_h : row
  @each_callback.call(payload)
end
|
175
|
+
|
176
|
+
# This sax-parses the whole sheet, just to extract hyperlink refs at the end.
|
177
|
+
# Runs HyperlinksParser over the sheet stream and stores what it returns in
# hyperlinks_by_cell.
def load_gui_hyperlinks
  links = HyperlinksParser.parse(@file_io, xrels: xrels)
  self.hyperlinks_by_cell = links
end
|
181
|
+
|
182
|
+
class HyperlinksParser < Nokogiri::XML::SAX::Document
|
183
|
+
# file_io - the sheet's XML stream
# xrels   - the parsed relationships document queried for link targets
def initialize(file_io, xrels:)
  @file_io, @xrels = file_io, xrels
end
|
187
|
+
|
188
|
+
# Convenience entry point: builds a parser and returns its parse result.
def self.parse(file_io, xrels:)
  parser = new(file_io, xrels: xrels)
  parser.parse
end
|
191
|
+
|
192
|
+
# SAX-parses the sheet stream and returns the Hash filled in by
# start_element (hyperlink ref => relationship Target).
def parse
  @hyperlinks_by_cell = {}
  sax_parser = Nokogiri::XML::SAX::Parser.new(self)
  sax_parser.parse(@file_io)
  @hyperlinks_by_cell
end
|
197
|
+
|
198
|
+
# SAX hook: for every <hyperlink> element, records the Target of the
# relationship it points at, keyed by the element's 'ref' attribute.
#
# name  - element name; everything but 'hyperlink' is ignored
# attrs - Array of [attribute_name, value] pairs
def start_element(name, attrs)
  return unless name == 'hyperlink'

  attrs = attrs.to_h
  id = attrs['id'] || attrs['r:id']
  # A hyperlink without a relationship id has no target to look up.
  return if id.nil?

  relationship = @xrels.at_xpath(%(//*[@Id="#{id}"]))
  # at_xpath returns nil when no relationship carries this id; skip the
  # cell rather than raising NoMethodError on nil.
  return if relationship.nil?

  @hyperlinks_by_cell[attrs['ref']] = relationship.attr('Target')
end
|
208
|
+
end
|
209
|
+
|
210
|
+
# The relationships stream parsed (once) into a Nokogiri document; nil when
# no relationships file was attached to this parser.
def xrels
  return unless xrels_file

  @xrels ||= Nokogiri::XML(xrels_file.read)
end
|
213
|
+
|
214
|
+
# Number of columns per row, derived from the sheet dimension and memoized;
# 0 while no dimension is known.
def column_length
  if @dimension
    @column_length ||= column_letter_to_number(last_cell_letter)
  else
    0
  end
end
|
219
|
+
|
220
|
+
# Zero-based column index of the cell currently being read, taken from the
# letters of its name (ex. "C7" -> 2).
def cell_idx
  letters = @cell_name[/[A-Z]+/]
  column_letter_to_number(letters) - 1
end
|
223
|
+
|
224
|
+
##
|
225
|
+
# Returns the last column name, ex. 'E'
|
226
|
+
# Returns the last column name of the sheet dimension, ex. 'E' for "A1:E9";
# nil when no dimension is known.
def last_cell_letter
  return unless @dimension

  # A single-cell dimension such as "C3" has no ':' part; use that cell's
  # own column before defaulting to 'A', so padding rows get the real width.
  @dimension[/:([A-Z]+)/, 1] || @dimension[/[A-Z]+/] || 'A'
end
|
231
|
+
|
232
|
+
# formula fits an exponential factorial function of the form:
|
233
|
+
# 'A' = 1
|
234
|
+
# 'B' = 2
|
235
|
+
# 'Z' = 26
|
236
|
+
# 'AA' = 26 * 1 + 1
|
237
|
+
# 'AZ' = 26 * 1 + 26
|
238
|
+
# 'BA' = 26 * 2 + 1
|
239
|
+
# 'ZA' = 26 * 26 + 1
|
240
|
+
# 'ZZ' = 26 * 26 + 26
|
241
|
+
# 'AAA' = 26 * 26 * 1 + 26 * 1 + 1
|
242
|
+
# 'AAZ' = 26 * 26 * 1 + 26 * 1 + 26
|
243
|
+
# 'ABA' = 26 * 26 * 1 + 26 * 2 + 1
|
244
|
+
# 'BZA' = 26 * 26 * 2 + 26 * 26 + 1
|
245
|
+
# Converts a column name to its 1-based number by reading the letters as
# base-26 digits where 'A' = 1 ... 'Z' = 26 (ex. 'AA' -> 27, 'ZZ' -> 702).
def column_letter_to_number(column_letter)
  column_letter.each_byte.reduce(0) do |number, byte|
    number * 26 + (byte - 64) # 64 is one below 'A'.ord
  end
end
|
254
|
+
end
|
255
|
+
end
|
256
|
+
end
|
@@ -0,0 +1,115 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SimpleXlsxReader
|
4
|
+
class Loader
|
5
|
+
# Reads a styles XML stream and works out, for every cell style, the symbol
# that Loader.cast uses to typecast cells carrying that style.
StyleTypesParser = Struct.new(:file_io) do
  # Parses file_io and returns the array of style types.
  def self.parse(file_io)
    new(file_io).tap(&:parse).style_types
  end

  # Map of non-custom numFmtId to casting symbol
  NumFmtMap = {
    0 => :string, # General
    1 => :fixnum, # 0
    2 => :float, # 0.00
    3 => :fixnum, # #,##0
    4 => :float, # #,##0.00
    5 => :unsupported, # $#,##0_);($#,##0)
    6 => :unsupported, # $#,##0_);[Red]($#,##0)
    7 => :unsupported, # $#,##0.00_);($#,##0.00)
    8 => :unsupported, # $#,##0.00_);[Red]($#,##0.00)
    9 => :percentage, # 0%
    10 => :percentage, # 0.00%
    11 => :bignum, # 0.00E+00
    12 => :unsupported, # # ?/?
    13 => :unsupported, # # ??/??
    14 => :date, # mm-dd-yy
    15 => :date, # d-mmm-yy
    16 => :date, # d-mmm
    17 => :date, # mmm-yy
    18 => :time, # h:mm AM/PM
    19 => :time, # h:mm:ss AM/PM
    20 => :time, # h:mm
    21 => :time, # h:mm:ss
    22 => :date_time, # m/d/yy h:mm
    37 => :unsupported, # #,##0 ;(#,##0)
    38 => :unsupported, # #,##0 ;[Red](#,##0)
    39 => :unsupported, # #,##0.00;(#,##0.00)
    40 => :unsupported, # #,##0.00;[Red](#,##0.00)
    45 => :time, # mm:ss
    46 => :time, # [h]:mm:ss
    47 => :time, # mmss.0
    48 => :bignum, # ##0.0E+0
    49 => :unsupported # @
  }.freeze

  # Loads the XML document (namespaces stripped so plain xpaths work).
  def parse
    @xml = Nokogiri::XML(file_io.read).remove_namespaces!
  end

  # Excel doesn't record types for some cells, only its display style, so
  # we have to back out the type from that style.
  #
  # Some of these styles can be determined from a known set (see NumFmtMap),
  # while others are 'custom' and we have to make a best guess.
  #
  # Returns the array of types corresponding to the styles a spreadsheet
  # uses, covering both the known style types and the custom styles.
  #
  # Note that sheet cells don't reference the numFmtId, but instead the
  # array index of a style in the stored list of only the styles used in the
  # spreadsheet. Hence an array, rather than a map of numFmtId to type.
  def style_types
    @xml.xpath('/styleSheet/cellXfs/xf').map do |xstyle|
      style_type_by_num_fmt_id(
        xstyle.attributes['numFmtId']&.value
      )
    end
  end

  # Finds the type we think a style is; for example, fmtId 14 is a date
  # style, so this would return :date. Returns nil for a nil id.
  #
  # Note, custom styles usually (are supposed to?) have a numFmtId >= 164,
  # but in practice can sometimes be simply out of the usual "Any Language"
  # id range that goes up to 49, so any id missing from NumFmtMap is looked
  # up among the custom formats.
  def style_type_by_num_fmt_id(id)
    return nil if id.nil?

    id = id.to_i
    NumFmtMap[id] || custom_style_types[id]
  end

  # Map of custom numFmtId to our best guess at the type,
  # ex. {164 => :date_time}. Memoized.
  def custom_style_types
    @custom_style_types ||=
      @xml.xpath('/styleSheet/numFmts/numFmt')
          .each_with_object({}) do |xstyle, acc|
        acc[xstyle.attributes['numFmtId'].value.to_i] =
          determine_custom_style_type(xstyle.attributes['formatCode'].value)
      end
  end

  # This is the least deterministic part of reading xlsx files. Due to
  # custom styles, you can't know for sure when a date is a date other than
  # looking at its format and guessing. It's not impossible to guess right,
  # though.
  #
  # http://stackoverflow.com/questions/4948998/determining-if-an-xlsx-cell-is-date-formatted-for-excel-2007-spreadsheets
  #
  # string - the custom format code
  #
  # Returns :float, :date_time or :unsupported.
  def determine_custom_style_type(string)
    return :float if string.start_with?('_')
    # Compare the two-character prefix; a single character (string[0]) can
    # never equal ' 0', which left this rule unreachable.
    return :float if string.start_with?(' 0')

    # Looks for one of ymdhis outside of meta-stuff like [Red]
    return :date_time if string.match?(/(^|\])[^\[]*[ymdhis]/i)

    :unsupported
  end
end
|
114
|
+
end
|
115
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SimpleXlsxReader
|
4
|
+
class Loader
|
5
|
+
# Reads a workbook XML stream for the sheet listing and the date system.
WorkbookParser = Struct.new(:file_io) do
  # Parses file_io and returns [sheet_toc, base_date].
  def self.parse(file_io)
    instance = new(file_io)
    instance.parse
    [instance.sheet_toc, instance.base_date]
  end

  # Loads the XML document (namespaces stripped so plain xpaths work).
  def parse
    @xml = Nokogiri::XML(file_io.read).remove_namespaces!
  end

  # Table of contents for the sheets, ex. {'Authors' => 0, ...}
  def sheet_toc
    pairs = @xml.xpath('/workbook/sheets/sheet').map do |sheet|
      sheet_name = sheet.attributes['name'].value
      zero_based_id = sheet.attributes['sheetId'].value.to_i - 1 # keep things 0-indexed
      [sheet_name, zero_based_id]
    end
    pairs.to_h
  end

  # Returns the base_date from which to calculate dates: the 1900 system
  # unless a workbookPr node sets date1904 to true/1. Also the 1900 system
  # when nothing has been parsed yet.
  # http://msdn.microsoft.com/en-us/library/ff530155(v=office.12).aspx
  def base_date
    return DATE_SYSTEM_1900 if @xml.nil?

    uses_1904 = @xml.xpath('//workbook/workbookPr[@date1904]').any? do |workbook_pr|
      /true|1/i.match?(workbook_pr['date1904'])
    end

    uses_1904 ? DATE_SYSTEM_1904 : DATE_SYSTEM_1900
  end
end
|
38
|
+
end
|
39
|
+
end
|
@@ -0,0 +1,199 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SimpleXlsxReader
|
4
|
+
class Loader < Struct.new(:file_path)
|
5
|
+
attr_accessor :shared_strings, :sheet_parsers, :sheet_toc, :style_types, :base_date
|
6
|
+
|
7
|
+
# Reads the workbook through ZipReader (which fills in this loader's
# attributes) and returns one Document::Sheet per table-of-contents entry.
def init_sheets
  ZipReader.new(file_path: file_path, loader: self).read

  # The sheet number stored in the TOC is *not* the index into
  # sheet_parsers, so names and parsers are paired by position.
  sheet_toc.keys.each_with_index.map do |sheet_name, position|
    SimpleXlsxReader::Document::Sheet.new(
      name: sheet_name,
      sheet_parser: sheet_parsers[position]
    )
  end
end
|
21
|
+
|
22
|
+
# Opens the xlsx zip archive and feeds its parts to the individual parsers,
# storing the results on the loader.
ZipReader = Struct.new(:file_path, :loader, keyword_init: true) do
  attr_reader :zip

  def initialize(*args)
    super
    @zip = SimpleXlsxReader::Zip.open(file_path)
  end

  # Populates loader.sheet_toc, base_date, style_types, shared_strings and
  # sheet_parsers from the archive.
  def read
    entry_at('xl/workbook.xml') do |file_io|
      toc, base_date = WorkbookParser.parse(file_io)
      loader.sheet_toc = toc
      loader.base_date = base_date
    end

    entry_at('xl/styles.xml') do |file_io|
      loader.style_types = StyleTypesParser.parse(file_io)
    end

    # optional feature used by excel,
    # but not often used by xlsx generation libraries
    shared_strings_entry = entry_at('xl/sharedStrings.xml')
    if shared_strings_entry
      shared_strings_entry.get_input_stream do |file|
        loader.shared_strings = SharedStringsParser.parse(file)
      end
    else
      loader.shared_strings = []
    end

    loader.sheet_parsers = []

    # Sometimes there's a zero-index sheet.xml, ex.
    # Google Docs creates:
    #   xl/worksheets/sheet.xml
    #   xl/worksheets/sheet1.xml
    #   xl/worksheets/sheet2.xml
    # While Excel creates:
    #   xl/worksheets/sheet1.xml
    #   xl/worksheets/sheet2.xml
    add_sheet_parser_at_index(nil)

    sheet_number = 1
    sheet_number += 1 while add_sheet_parser_at_index(sheet_number)
  end

  # Finds the archive entry for path, trying the path as given, with
  # backslashes, lowercased, and lowercased with backslashes, in that order.
  # With a block, the entry's input stream is opened with that block;
  # without one, the entry (or nil when nothing matched) is returned.
  def entry_at(path, &block)
    # Older and newer (post-mid-2021) RubyZip normalizes pathnames,
    # but unfortunately there is a time in between where it doesn't.
    # Rather than require a specific version, let's just be flexible.
    windows_path = path.tr('/', '\\')
    candidates = [path, windows_path, path.downcase, windows_path.downcase]

    entry = nil
    candidates.each do |candidate|
      entry = zip.find_entry(candidate)
      break if entry
    end

    return entry unless block

    entry.get_input_stream(&block)
  end

  # Adds a SheetParser for sheet<i>.xml (plus its .rels stream when present)
  # to loader.sheet_parsers. Returns nil when the archive has no such sheet.
  def add_sheet_parser_at_index(i)
    sheet_entry = entry_at("xl/worksheets/sheet#{i}.xml")
    return unless sheet_entry

    parser = SheetParser.new(
      file_io: sheet_entry.get_input_stream,
      loader: loader
    )

    rels_entry = entry_at("xl/worksheets/_rels/sheet#{i}.xml.rels")
    parser.xrels_file = rels_entry.get_input_stream if rels_entry

    loader.sheet_parsers << parser
  end
end
|
102
|
+
|
103
|
+
##
|
104
|
+
# The heart of typecasting. The ruby type is determined either explicitly
|
105
|
+
# from the cell xml or implicitly from the cell style, and this
|
106
|
+
# method expects that work to have been done already. This, then,
|
107
|
+
# takes the type we determined it to be and casts the cell value
|
108
|
+
# to that type.
|
109
|
+
#
|
110
|
+
# types:
|
111
|
+
# - s: shared string (see #shared_string)
|
112
|
+
# - n: number (cast to a float)
|
113
|
+
# - b: boolean
|
114
|
+
# - str: string
|
115
|
+
# - inlineStr: string
|
116
|
+
# - ruby symbol: for when type has been determined by style
|
117
|
+
#
|
118
|
+
# options:
|
119
|
+
# - shared_strings: needed for 's' (shared string) type
|
120
|
+
# Casts a raw cell string to a ruby value.
#
# value   - the cell text; nil or empty returns nil
# type    - the cell's explicit type ('s', 'n', 'b', 'str', 'inlineStr') or nil
# style   - the symbol derived from the cell style; it takes over when type
#           is nil, or when type is 'n' and the style is date/time-like
# options - :shared_strings (needed for 's'), :base_date (for dates;
#           defaults to DATE_SYSTEM_1900), :url (wraps the result in a Hyperlink)
def self.cast(value, type, style, options = {})
  return nil if value.nil? || value.empty?

  # Sometimes the type is dictated by the style alone
  style_decides = type.nil? || (type == 'n' && %i[date time date_time].include?(style))
  type = style if style_decides

  casted =
    case type
    when 's' # shared string
      options[:shared_strings][value.to_i]
    when 'n', :float
      value.to_f
    when 'b'
      value.to_i == 1
    when :fixnum
      value.to_i
    when :percentage
      value.to_f / 100
    when :date, :time, :date_time
      # the trickiest. note that all these formats can vary on
      # whether they actually contain a date, time, or datetime.
      serial = Float(value)
      whole_days = serial.to_i
      day_fraction = serial - whole_days

      # http://stackoverflow.com/questions/10559767/how-to-convert-ms-excel-date-from-float-to-date-format-in-ruby
      date = options.fetch(:base_date, DATE_SYSTEM_1900) + whole_days

      # NOTE: both exits return straight from the method, so date/time
      # values skip the :url wrapping below.
      return date unless day_fraction > 0 # no time associated

      seconds = (day_fraction * 86_400).round
      return Time.utc(date.year, date.month, date.day) + seconds
    when :bignum
      defined?(BigDecimal) ? BigDecimal(value) : value.to_f
    else
      # 'str', 'inlineStr', :string, :unsupported and anything unrecognized
      value
    end

  options[:url] ? Hyperlink.new(options[:url], casted) : casted
end
|
197
|
+
end
|
198
|
+
end
|
199
|
+
|