simple_xlsx_reader 1.0.5 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,256 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'forwardable'
4
+
5
+ module SimpleXlsxReader
6
+ class Loader
7
+ class SheetParser < Nokogiri::XML::SAX::Document
8
+ extend Forwardable
9
+
10
+ attr_accessor :xrels_file
11
+ attr_accessor :hyperlinks_by_cell
12
+
13
+ attr_reader :load_errors
14
+
15
+ def_delegators :@loader, :style_types, :shared_strings, :base_date
16
+
17
+ def initialize(file_io:, loader:)
18
+ @file_io = file_io
19
+ @loader = loader
20
+ end
21
+
22
# Streams the sheet XML through the SAX hooks of this class, calling the
# given block once per yielded row.
#
# headers - stored in @headers and interpreted by #end_element
#           (default: false).
#
# Raises RuntimeError when called without a block.
def parse(headers: false, &block)
  unless block_given?
    raise 'parse called without a block; what should this do?'
  end

  @headers = headers
  @each_callback = block
  @load_errors = {}
  @current_row_num = nil
  @last_seen_row_idx = 0
  # Assigned up front only to silence uninitialized-ivar warnings.
  @url = @function = @capture = @dimension = nil

  @file_io.rewind # in case we've already parsed this once

  # In this project this is only used for GUI-made hyperlinks (as opposed
  # to FUNCTION-based hyperlinks). Unfortunately they're needed to parse
  # the spreadsheet, and they come AFTER the sheet data. So, the solution
  # is to just stream-parse the file twice, first for the hyperlinks at the
  # bottom of the file, then for the file itself. In the future it would
  # be clever to use grep to extract the xml into its own smaller file.
  if xrels_file&.grep(/hyperlink/)&.any?
    xrels_file.rewind
    load_gui_hyperlinks # represented as hyperlinks_by_cell
    @file_io.rewind
  end

  Nokogiri::XML::SAX::Parser.new(self).parse(@file_io)
end
52
+
53
+ ###
54
+ # SAX document hooks
55
+
56
# SAX hook invoked for every opening tag; records the state that
# #characters and #end_element rely on.
#
# name  - tag name as reported by the SAX parser.
# attrs - array of [attribute_name, value] pairs (default: []).
def start_element(name, attrs = [])
  case name
  when 'dimension'
    # Look `ref` up by name: taking whichever attribute comes last picks
    # the wrong value when other attributes follow it, and raises when the
    # tag carries no attributes at all.
    @dimension = attrs.assoc('ref')&.last
  when 'row'
    @current_row_num = attrs.assoc('r')&.last&.to_i
    @current_row = Array.new(column_length)
  when 'c'
    cell_attrs = attrs.to_h
    @cell_name = cell_attrs['r']
    @type = cell_attrs['t']
    style_ref = cell_attrs['s']
    # `s` is an index into style_types, not a numFmtId.
    @style = style_ref && style_types[style_ref.to_i]
  when 'f' then @function = true
  when 'v', 't' then @capture = true
  end
end
71
+
72
# SAX hook invoked with the text content of the current element.
#
# Inside an <f> element the text is scanned for a HYPERLINK("...") formula.
# Inside a captured element (<v> or <t>) the text is cast and stored in the
# current row at the current cell's column index.
#
# Raises CellLoadError when casting fails and catch_cell_load_errors is off.
def characters(string)
  # the only "function" we support is a hyperlink
  @url = string.slice(/HYPERLINK\("(.*?)"/, 1) if @function

  return unless @capture

  @current_row[cell_idx] =
    begin
      SimpleXlsxReader::Loader.cast(
        string.strip, @type, @style,
        url: @url || hyperlinks_by_cell&.[](@cell_name),
        shared_strings: shared_strings,
        base_date: base_date
      )
    rescue StandardError => e
      column, row = @cell_name.match(/([A-Z]+)([0-9]+)/).captures
      col_idx = column_letter_to_number(column) - 1
      row_idx = row.to_i - 1

      if SimpleXlsxReader.configuration.catch_cell_load_errors
        # Record the failure and fall back to the raw (stripped) text.
        @load_errors[[row_idx, col_idx]] = e.message

        string.strip
      else
        error = CellLoadError.new(
          "Row #{row_idx}, Col #{col_idx}: #{e.message}"
        )
        error.set_backtrace(e.backtrace)
        raise error
      end
    end
end
106
+
107
# SAX hook invoked for every closing tag.
#
# Closing <v>/<t>, <f> and <c> resets the capture, function and url state.
# Closing a <row> either resolves the header row (depending on what
# @headers currently holds) or yields the finished row to the callback.
def end_element(name)
  case name
  when 'v', 't' then @capture = false
  when 'f' then @function = false
  when 'c' then @url = nil
  when 'row'
    if @headers == true # ya a little funky
      # `headers: true`: the first row seen becomes the header row.
      @headers = @current_row
    elsif @headers.is_a?(Hash)
      test_headers_hash_against_current_row
      # in case there were empty rows before finding the header
      @last_seen_row_idx = @current_row_num - 1
    elsif @headers.respond_to?(:call)
      @headers = @current_row if @headers.call(@current_row)
      # in case there were empty rows before finding the header
      @last_seen_row_idx = @current_row_num - 1
    else
      # Headers already resolved (truthy) or not requested at all (falsy).
      keyed = @headers ? true : false
      possibly_yield_empty_rows(headers: keyed)
      yield_row(@current_row, headers: keyed)
    end

    @last_seen_row_idx += 1

    # Note that excel writes a '/worksheet/dimension' node we can get
    # this from, but some libs (ex. simple_xlsx_writer) don't record it.
    # In that case, we assume the data is of uniform column length and
    # store the column name of the last header row we see. Obviously this
    # isn't the most robust strategy, but it likely fits 99% of use cases
    # considering it's not a problem with actual excel docs.
    @dimension = "A1:#{@cell_name}" if @dimension.nil?
  end
end
142
+
143
+ ###
144
+ # /End SAX hooks
145
+
146
# Checks the current row against the header search hash in @headers
# ({ key => String or pattern }). Each cell that matches a search is
# replaced in the row by the corresponding key; if at least one cell
# matched, the (rewritten) row becomes @headers.
#
# String searches must equal the cell exactly; anything else is handed to
# the cell's #match?.
def test_headers_hash_against_current_row
  found = false

  @current_row.each_with_index do |cell, idx|
    @headers.each_pair do |key, search|
      matched =
        if search.is_a?(String)
          cell == search
        else
          # Cells are not always strings (numbers, dates, booleans, nil);
          # only pattern-match values that support it instead of raising
          # NoMethodError on the first non-string cell.
          cell.respond_to?(:match?) && cell.match?(search)
        end
      next unless matched

      found = true
      @current_row[idx] = key
    end
  end

  @headers = @current_row if found
end
160
+
161
# Yields one blank row (an array of column_length nils) for every row
# number skipped between the last row seen and the current one, advancing
# @last_seen_row_idx as it goes. Does nothing when the current row number
# is unknown.
def possibly_yield_empty_rows(headers:)
  return unless @current_row_num

  until @current_row_num <= @last_seen_row_idx + 1
    @last_seen_row_idx += 1
    yield_row(Array.new(column_length), headers: headers)
  end
end
167
+
168
# Hands a row to the callback given to #parse: as a hash keyed by @headers
# when `headers` is truthy, otherwise as the plain array.
def yield_row(row, headers:)
  payload = headers ? @headers.zip(row).to_h : row
  @each_callback.call(payload)
end
175
+
176
+ # This sax-parses the whole sheet, just to extract hyperlink refs at the end.
177
+ def load_gui_hyperlinks
178
+ self.hyperlinks_by_cell =
179
+ HyperlinksParser.parse(@file_io, xrels: xrels)
180
+ end
181
+
182
+ class HyperlinksParser < Nokogiri::XML::SAX::Document
183
+ def initialize(file_io, xrels:)
184
+ @file_io = file_io
185
+ @xrels = xrels
186
+ end
187
+
188
+ def self.parse(file_io, xrels:)
189
+ new(file_io, xrels: xrels).parse
190
+ end
191
+
192
+ def parse
193
+ @hyperlinks_by_cell = {}
194
+ Nokogiri::XML::SAX::Parser.new(self).parse(@file_io)
195
+ @hyperlinks_by_cell
196
+ end
197
+
198
# SAX hook: for every <hyperlink> element, records the relationship
# target under the cell reference in @hyperlinks_by_cell.
#
# name  - tag name as reported by the SAX parser.
# attrs - array of [attribute_name, value] pairs.
def start_element(name, attrs)
  return unless name == 'hyperlink'

  link = attrs.to_h
  id = link['id'] || link['r:id']
  # Not every <hyperlink> resolves to a relationship: the id may be absent
  # or missing from the rels document. Skip those rather than calling
  # #attr on nil and aborting the whole parse.
  return unless id

  target = @xrels.at_xpath(%(//*[@Id="#{id}"]))&.attr('Target')
  @hyperlinks_by_cell[link['ref']] = target if target
end
208
+ end
209
+
210
+ def xrels
211
+ @xrels ||= Nokogiri::XML(xrels_file.read) if xrels_file
212
+ end
213
+
214
# Number of columns implied by @dimension (memoized once computed), or 0
# while no dimension is known.
def column_length
  if @dimension
    @column_length ||= column_letter_to_number(last_cell_letter)
  else
    0
  end
end
219
+
220
# Zero-based column index of the current cell, taken from the first run of
# capital letters in @cell_name (ex. 'B7' => 1).
def cell_idx
  letters = @cell_name[/[A-Z]+/]
  column_letter_to_number(letters) - 1
end
223
+
224
##
# Returns the last column name in @dimension, ex. 'E' for 'A1:E10'.
# Falls back to 'A' when the dimension has no ':<letters>' part, and
# returns nil when no dimension is known.
def last_cell_letter
  return unless @dimension

  @dimension[/:([A-Z]+)/, 1] || 'A'
end
231
+
232
# Converts a column name into its 1-based column number by reading the
# letters as base-26 digits where 'A' = 1 ... 'Z' = 26 (there is no zero
# digit):
#   'A'   = 1
#   'B'   = 2
#   'Z'   = 26
#   'AA'  = 26 * 1 + 1
#   'AZ'  = 26 * 1 + 26
#   'BA'  = 26 * 2 + 1
#   'ZA'  = 26 * 26 + 1
#   'ZZ'  = 26 * 26 + 26
#   'AAA' = 26 * 26 * 1 + 26 * 1 + 1
#   'AAZ' = 26 * 26 * 1 + 26 * 1 + 26
#   'ABA' = 26 * 26 * 1 + 26 * 2 + 1
#   'BZA' = 26 * 26 * 2 + 26 * 26 + 1
def column_letter_to_number(column_letter)
  column_letter.each_byte.reduce(0) do |total, byte|
    total * 26 + (byte - 64) # 64 is one below the byte value of 'A'
  end
end
254
+ end
255
+ end
256
+ end
@@ -0,0 +1,115 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SimpleXlsxReader
4
+ class Loader
5
+ StyleTypesParser = Struct.new(:file_io) do
6
+ def self.parse(file_io)
7
+ new(file_io).tap(&:parse).style_types
8
+ end
9
+
10
+ # Map of non-custom numFmtId to casting symbol
11
+ NumFmtMap = {
12
+ 0 => :string, # General
13
+ 1 => :fixnum, # 0
14
+ 2 => :float, # 0.00
15
+ 3 => :fixnum, # #,##0
16
+ 4 => :float, # #,##0.00
17
+ 5 => :unsupported, # $#,##0_);($#,##0)
18
+ 6 => :unsupported, # $#,##0_);[Red]($#,##0)
19
+ 7 => :unsupported, # $#,##0.00_);($#,##0.00)
20
+ 8 => :unsupported, # $#,##0.00_);[Red]($#,##0.00)
21
+ 9 => :percentage, # 0%
22
+ 10 => :percentage, # 0.00%
23
+ 11 => :bignum, # 0.00E+00
24
+ 12 => :unsupported, # # ?/?
25
+ 13 => :unsupported, # # ??/??
26
+ 14 => :date, # mm-dd-yy
27
+ 15 => :date, # d-mmm-yy
28
+ 16 => :date, # d-mmm
29
+ 17 => :date, # mmm-yy
30
+ 18 => :time, # h:mm AM/PM
31
+ 19 => :time, # h:mm:ss AM/PM
32
+ 20 => :time, # h:mm
33
+ 21 => :time, # h:mm:ss
34
+ 22 => :date_time, # m/d/yy h:mm
35
+ 37 => :unsupported, # #,##0 ;(#,##0)
36
+ 38 => :unsupported, # #,##0 ;[Red](#,##0)
37
+ 39 => :unsupported, # #,##0.00;(#,##0.00)
38
+ 40 => :unsupported, # #,##0.00;[Red](#,##0.00)
39
+ 45 => :time, # mm:ss
40
+ 46 => :time, # [h]:mm:ss
41
+ 47 => :time, # mmss.0
42
+ 48 => :bignum, # ##0.0E+0
43
+ 49 => :unsupported # @
44
+ }.freeze
45
+
46
+ def parse
47
+ @xml = Nokogiri::XML(file_io.read).remove_namespaces!
48
+ end
49
+
50
+ # Excel doesn't record types for some cells, only its display style, so
51
+ # we have to back out the type from that style.
52
+ #
53
+ # Some of these styles can be determined from a known set (see NumFmtMap),
54
+ # while others are 'custom' and we have to make a best guess.
55
+ #
56
+ # This is the array of types corresponding to the styles a spreadsheet
57
+ # uses, and includes both the known style types and the custom styles.
58
+ #
59
+ # Note that the xml sheet cells that use this don't reference the
60
+ # numFmtId, but instead the array index of a style in the stored list of
61
+ # only the styles used in the spreadsheet (which can be either known or
62
+ # custom). Hence this style types array, rather than a map of numFmtId to
63
+ # type.
64
+ def style_types
65
+ @xml.xpath('/styleSheet/cellXfs/xf').map do |xstyle|
66
+ style_type_by_num_fmt_id(
67
+ xstyle.attributes['numFmtId']&.value
68
+ )
69
+ end
70
+ end
71
+
72
# Resolves a numFmtId to the type we think that style represents; for
# example, fmtId 14 is a date style, so this returns :date. Built-in ids
# are looked up in NumFmtMap first, then the workbook's custom styles.
# Returns nil for a nil id or an id found in neither.
#
# Note, custom styles usually (are supposed to?) have a numFmtId >= 164,
# but in practice can sometimes be simply out of the usual "Any Language"
# id range that goes up to 49. For example, I have seen a numFmtId of
# 59 specified as a date. In Thai, 59 is a number format, so this seems
# like a bad idea, but we try to be flexible and just go with it.
def style_type_by_num_fmt_id(id)
  return if id.nil?

  fmt_id = id.to_i
  builtin = NumFmtMap[fmt_id]
  builtin || custom_style_types[fmt_id]
end
86
+
87
+ # Map of (numFmtId >= 164) (custom styles) to our best guess at the type
88
+ # ex. {164 => :date_time}
89
+ def custom_style_types
90
+ @custom_style_types ||=
91
+ @xml.xpath('/styleSheet/numFmts/numFmt')
92
+ .each_with_object({}) do |xstyle, acc|
93
+ acc[xstyle.attributes['numFmtId'].value.to_i] =
94
+ determine_custom_style_type(xstyle.attributes['formatCode'].value)
95
+ end
96
+ end
97
+
98
# This is the least deterministic part of reading xlsx files. Due to
# custom styles, you can't know for sure when a date is a date other than
# looking at its format and guessing. It's not impossible to guess right,
# though.
#
# http://stackoverflow.com/questions/4948998/determining-if-an-xlsx-cell-is-date-formatted-for-excel-2007-spreadsheets
#
# string - the custom format code.
#
# Returns :float, :date_time or :unsupported.
def determine_custom_style_type(string)
  # Format codes starting with '_' or with ' 0' are treated as floats.
  # (A prefix check is required for ' 0': a single character can never
  # equal that two-character string.)
  return :float if string.start_with?('_', ' 0')

  # Looks for one of ymdhis outside of meta-stuff like [Red]
  return :date_time if string.match?(/(^|\])[^\[]*[ymdhis]/i)

  :unsupported
end
113
+ end
114
+ end
115
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SimpleXlsxReader
4
+ class Loader
5
+ WorkbookParser = Struct.new(:file_io) do
6
+ def self.parse(file_io)
7
+ parser = new(file_io).tap(&:parse)
8
+ [parser.sheet_toc, parser.base_date]
9
+ end
10
+
11
+ def parse
12
+ @xml = Nokogiri::XML(file_io.read).remove_namespaces!
13
+ end
14
+
15
+ # Table of contents for the sheets, ex. {'Authors' => 0, ...}
16
+ def sheet_toc
17
+ @xml.xpath('/workbook/sheets/sheet')
18
+ .each_with_object({}) do |sheet, acc|
19
+ acc[sheet.attributes['name'].value] =
20
+ sheet.attributes['sheetId'].value.to_i - 1 # keep things 0-indexed
21
+ end
22
+ end
23
+
24
## Returns the base_date from which to calculate dates.
# This is DATE_SYSTEM_1900 unless a workbookPr node in the workbook has a
# date1904 attribute matching true or 1, in which case it is
# DATE_SYSTEM_1904. Also DATE_SYSTEM_1900 when nothing has been parsed.
# http://msdn.microsoft.com/en-us/library/ff530155(v=office.12).aspx
def base_date
  return DATE_SYSTEM_1900 if @xml.nil?

  uses_1904 =
    @xml.xpath('//workbook/workbookPr[@date1904]').any? do |workbook_pr|
      workbook_pr['date1904'] =~ /true|1/i
    end

  uses_1904 ? DATE_SYSTEM_1904 : DATE_SYSTEM_1900
end
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,199 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SimpleXlsxReader
4
+ class Loader < Struct.new(:file_path)
5
+ attr_accessor :shared_strings, :sheet_parsers, :sheet_toc, :style_types, :base_date
6
+
7
+ def init_sheets
8
+ ZipReader.new(
9
+ file_path: file_path,
10
+ loader: self
11
+ ).read
12
+
13
+ sheet_toc.each_with_index.map do |(sheet_name, _sheet_number), i|
14
+ # sheet_number is *not* the index into xml.sheet_parsers
15
+ SimpleXlsxReader::Document::Sheet.new(
16
+ name: sheet_name,
17
+ sheet_parser: sheet_parsers[i]
18
+ )
19
+ end
20
+ end
21
+
22
+ ZipReader = Struct.new(:file_path, :loader, keyword_init: true) do
23
+ attr_reader :zip
24
+
25
+ def initialize(*args)
26
+ super
27
+ @zip = SimpleXlsxReader::Zip.open(file_path)
28
+ end
29
+
30
+ def read
31
+ entry_at('xl/workbook.xml') do |file_io|
32
+ loader.sheet_toc, loader.base_date = *WorkbookParser.parse(file_io)
33
+ end
34
+
35
+ entry_at('xl/styles.xml') do |file_io|
36
+ loader.style_types = StyleTypesParser.parse(file_io)
37
+ end
38
+
39
+ # optional feature used by excel,
40
+ # but not often used by xlsx generation libraries
41
+ if (ss_entry = entry_at('xl/sharedStrings.xml'))
42
+ ss_entry.get_input_stream do |file|
43
+ loader.shared_strings = SharedStringsParser.parse(file)
44
+ end
45
+ else
46
+ loader.shared_strings = []
47
+ end
48
+
49
+ loader.sheet_parsers = []
50
+
51
+ # Sometimes there's a zero-index sheet.xml, ex.
52
+ # Google Docs creates:
53
+ # xl/worksheets/sheet.xml
54
+ # xl/worksheets/sheet1.xml
55
+ # xl/worksheets/sheet2.xml
56
+ # While Excel creates:
57
+ # xl/worksheets/sheet1.xml
58
+ # xl/worksheets/sheet2.xml
59
+ add_sheet_parser_at_index(nil)
60
+
61
+ i = 1
62
+ while(add_sheet_parser_at_index(i)) do
63
+ i += 1
64
+ end
65
+ end
66
+
67
# Looks up a zip entry by path, trying the path as given, with backslash
# separators, and the lowercase form of each, in that order.
#
# With a block, the entry's input stream is opened with that block and the
# result returned; without one, the entry itself (or nil when nothing
# matched) is returned.
def entry_at(path, &block)
  # Older and newer (post-mid-2021) RubyZip normalizes pathnames,
  # but unfortunately there is a time in between where it doesn't.
  # Rather than require a specific version, let's just be flexible.
  windows_path = path.tr('/', '\\')
  candidates = [
    path,                 # *nix-generated
    windows_path,         # Windows-generated
    path.downcase,        # Sometimes it's lowercase
    windows_path.downcase # Sometimes it's lowercase
  ]

  entry = nil
  candidates.each do |candidate|
    entry = zip.find_entry(candidate)
    break if entry
  end

  return entry unless block

  entry.get_input_stream(&block)
end
83
+
84
+ def add_sheet_parser_at_index(i)
85
+ sheet_file_name = "xl/worksheets/sheet#{i}.xml"
86
+ return unless (entry = entry_at(sheet_file_name))
87
+
88
+ parser =
89
+ SheetParser.new(
90
+ file_io: entry.get_input_stream,
91
+ loader: loader
92
+ )
93
+
94
+ relationship_file_name = "xl/worksheets/_rels/sheet#{i}.xml.rels"
95
+ if (rel = entry_at(relationship_file_name))
96
+ parser.xrels_file = rel.get_input_stream
97
+ end
98
+
99
+ loader.sheet_parsers << parser
100
+ end
101
+ end
102
+
103
##
# The heart of typecasting. The ruby type is determined either explicitly
# from the cell xml or implicitly from the cell style, and this method
# expects that work to have been done already. It takes the type we
# determined and casts the cell value to it.
#
# types:
# - s: shared string (looked up by index in options[:shared_strings])
# - n: number (cast to a float)
# - b: boolean
# - str: string
# - inlineStr: string
# - ruby symbol: for when type has been determined by style
#
# options:
# - shared_strings: needed for 's' (shared string) type
# - base_date: day zero for date-like values (default: DATE_SYSTEM_1900)
# - url: when present, the casted value is wrapped in a Hyperlink
#
# Returns nil for a nil or empty value. Date-like values are returned
# directly and are not wrapped in a Hyperlink.
def self.cast(value, type, style, options = {})
  return nil if value.nil? || value.empty?

  # Sometimes the type is dictated by the style alone
  style_decides =
    type.nil? || (type == 'n' && %i[date time date_time].include?(style))
  effective_type = style_decides ? style : type

  casted =
    case effective_type
    when 's' # shared string
      options[:shared_strings][value.to_i]
    when 'n', :float
      value.to_f
    when 'b'
      value.to_i == 1
    when :fixnum
      value.to_i
    when :percentage
      value.to_f / 100
    # the trickiest. note that all these formats can vary on
    # whether they actually contain a date, time, or datetime.
    when :date, :time, :date_time
      serial = Float(value)
      whole_days = serial.to_i
      day_fraction = serial - whole_days

      # http://stackoverflow.com/questions/10559767/how-to-convert-ms-excel-date-from-float-to-date-format-in-ruby
      date = options.fetch(:base_date, DATE_SYSTEM_1900) + whole_days
      return date unless day_fraction > 0

      # there is a time associated
      seconds = (day_fraction * 86_400).round
      return Time.utc(date.year, date.month, date.day) + seconds
    when :bignum
      defined?(BigDecimal) ? BigDecimal(value) : value.to_f
    else
      # 'str', 'inlineStr', :string, :unsupported, and anything unrecognized
      value
    end

  options[:url] ? Hyperlink.new(options[:url], casted) : casted
end
197
+ end
198
+ end
199
+
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module SimpleXlsxReader
2
- VERSION = "1.0.5"
4
+ VERSION = '2.0.0'
3
5
  end