simple_xlsx_reader 1.0.5 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,256 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'forwardable'
4
+
5
+ module SimpleXlsxReader
6
+ class Loader
7
+ class SheetParser < Nokogiri::XML::SAX::Document
8
+ extend Forwardable
9
+
10
+ attr_accessor :xrels_file
11
+ attr_accessor :hyperlinks_by_cell
12
+
13
+ attr_reader :load_errors
14
+
15
+ def_delegators :@loader, :style_types, :shared_strings, :base_date
16
+
17
# Builds a parser for one worksheet.
#
# file_io - IO over the sheet XML; it is rewound and SAX-parsed by #parse.
# loader  - object that style_types, shared_strings and base_date are
#           delegated to.
def initialize(file_io:, loader:)
  @loader = loader
  @file_io = file_io
end
21
+
22
# Stream-parses the sheet XML. The given block is stored as the per-row
# callback and `headers` is stored for the SAX hooks below to interpret.
#
# Raises RuntimeError when called without a block.
def parse(headers: false, &block)
  unless block_given?
    raise 'parse called without a block; what should this do?'
  end

  # Reset all per-parse state so the same parser can be run again.
  @headers = headers
  @each_callback = block
  @load_errors = {}
  @current_row_num = nil
  @last_seen_row_idx = 0
  # Assigned up front purely to silence "uninitialized ivar" warnings.
  @url = nil
  @function = nil
  @capture = nil
  @dimension = nil

  @file_io.rewind # in case we've already parsed this once

  # GUI-made hyperlinks (as opposed to FUNCTION-based ones) live at the
  # bottom of the sheet XML, AFTER the sheet data, yet they're needed while
  # casting cells. So the file is stream-parsed twice: once for the
  # hyperlinks, then once for the data. A cheaper extraction (e.g. grepping
  # the hyperlink XML into its own smaller file) would be a future
  # improvement.
  has_gui_hyperlinks = xrels_file&.grep(/hyperlink/)&.any?
  if has_gui_hyperlinks
    xrels_file.rewind
    load_gui_hyperlinks # result is stored in hyperlinks_by_cell
    @file_io.rewind
  end

  sax_parser = Nokogiri::XML::SAX::Parser.new(self)
  sax_parser.parse(@file_io)
end
52
+
53
+ ###
54
+ # SAX document hooks
55
+
56
# SAX hook: called for every opening tag.
#
# name  - element name
# attrs - array of [attribute_name, value] pairs
#
# Records the sheet dimension, starts a fresh row buffer, captures the
# current cell's reference/type/style, and flags when we are inside a
# formula ('f') or a value/text ('v'/'t') element.
def start_element(name, attrs = [])
  case name
  when 'dimension'
    # Look the 'ref' attribute up by name rather than assuming it is the
    # last attribute; a <dimension> without one leaves @dimension nil so
    # end_element falls back to inferring it from the cells seen.
    @dimension = attrs.find { |key, _value| key == 'ref' }&.last
  when 'row'
    @current_row_num = attrs.find { |key, _value| key == 'r' }&.last&.to_i
    @current_row = Array.new(column_length)
  when 'c'
    cell_attrs = attrs.to_h
    @cell_name = cell_attrs['r']
    @type = cell_attrs['t']
    # 's' is an index into the loader's style_types array
    @style = cell_attrs['s'] && style_types[cell_attrs['s'].to_i]
  when 'f' then @function = true
  when 'v', 't' then @capture = true
  end
end
71
+
72
# SAX hook: called with text found inside an element.
#
# Inside a formula element the text is scanned for a HYPERLINK("...") URL.
# Inside a value/text element the text is cast and written into the
# current row at the current cell's column.
#
# When casting fails, either a CellLoadError is raised, or (when
# configuration.catch_cell_load_errors is set) the message is recorded in
# @load_errors under [row_idx, col_idx] and the stripped raw text is
# stored in the cell instead.
#
# NOTE(review): SAX parsers may deliver one text node across several
# calls; each call here overwrites the cell rather than appending --
# confirm whether chunked text can occur for these sheets.
def characters(string)
  # the only "function" we support is a hyperlink
  @url = string.slice(/HYPERLINK\("(.*?)"/, 1) if @function

  return unless @capture

  target_idx = cell_idx

  cell_value =
    begin
      SimpleXlsxReader::Loader.cast(
        string.strip,
        @type,
        @style,
        url: @url || hyperlinks_by_cell&.[](@cell_name),
        shared_strings: shared_strings,
        base_date: base_date
      )
    rescue StandardError => e
      column, row = @cell_name.match(/([A-Z]+)([0-9]+)/).captures
      col_idx = column_letter_to_number(column) - 1
      row_idx = row.to_i - 1

      unless SimpleXlsxReader.configuration.catch_cell_load_errors
        error = CellLoadError.new(
          "Row #{row_idx}, Col #{col_idx}: #{e.message}"
        )
        error.set_backtrace(e.backtrace)
        raise error
      end

      @load_errors[[row_idx, col_idx]] = e.message
      string.strip
    end

  @current_row[target_idx] = cell_value
end
106
+
107
# SAX hook: called for every closing tag.
#
# On </row> the buffered row is either consumed as the header row or
# yielded to the callback, depending on what @headers currently is:
#   true     - this row becomes the headers
#   Hash     - row is tested against the hash; may become the headers
#   callable - row becomes the headers when the callable returns truthy
#   other truthy (headers already found) - row is yielded as a Hash
#   falsy    - row is yielded as an Array
#
# The other closing tags just reset the flags set in start_element.
def end_element(name)
  case name
  when 'row'
    if @headers == true # ya a little funky
      @headers = @current_row
    elsif @headers.is_a?(Hash)
      test_headers_hash_against_current_row
      # In case there were empty rows before finding the header. Rows
      # without an 'r' attribute leave @current_row_num nil; skip the
      # resync then instead of raising on nil - 1.
      @last_seen_row_idx = @current_row_num - 1 if @current_row_num
    elsif @headers.respond_to?(:call)
      @headers = @current_row if @headers.call(@current_row)
      # Same resync (and same nil guard) as the Hash case above.
      @last_seen_row_idx = @current_row_num - 1 if @current_row_num
    elsif @headers
      possibly_yield_empty_rows(headers: true)
      yield_row(@current_row, headers: true)
    else
      possibly_yield_empty_rows(headers: false)
      yield_row(@current_row, headers: false)
    end

    @last_seen_row_idx += 1

    # Note that excel writes a '/worksheet/dimension' node we can get
    # this from, but some libs (ex. simple_xlsx_writer) don't record it.
    # In that case, we assume the data is of uniform column length and
    # store the column name of the last cell of the first row we see.
    # Obviously this isn't the most robust strategy, but it likely fits
    # 99% of use cases considering it's not a problem with actual excel
    # docs.
    @dimension = "A1:#{@cell_name}" if @dimension.nil?
  when 'v', 't' then @capture = false
  when 'f' then @function = false
  when 'c' then @url = nil
  end
end
142
+
143
+ ###
144
+ # /End SAX hooks
145
+
146
# Checks whether the current row is the header row described by the
# @headers hash ({ key => String-or-pattern }).
#
# A String matcher must equal the cell exactly; any other matcher is
# applied with #match?. Cells that cannot be pattern-matched (nil,
# numbers, dates, ...) simply don't match instead of raising.
#
# Every matching cell is replaced in the row by its hash key, and if at
# least one cell matched, the (rewritten) row becomes @headers.
def test_headers_hash_against_current_row
  found = false

  @current_row.each_with_index do |cell, cell_idx|
    @headers.each_pair do |key, search|
      matched =
        if search.is_a?(String)
          cell == search
        else
          cell.respond_to?(:match?) && cell.match?(search)
        end
      next unless matched

      found = true
      @current_row[cell_idx] = key
    end
  end

  @headers = @current_row if found
end
160
+
161
# Yields one all-nil row for every row number skipped between the last row
# seen and the current one. Does nothing when the current row number is
# unknown.
def possibly_yield_empty_rows(headers:)
  return unless @current_row_num

  until @last_seen_row_idx + 1 >= @current_row_num
    @last_seen_row_idx += 1
    blank_row = Array.new(column_length)
    yield_row(blank_row, headers: headers)
  end
end
167
+
168
# Hands one row to the per-row callback: as a Hash keyed by @headers when
# `headers` is truthy, otherwise as the plain Array.
def yield_row(row, headers:)
  payload = headers ? @headers.zip(row).to_h : row
  @each_callback.call(payload)
end
175
+
176
# SAX-parses the whole sheet once just to collect the hyperlink refs found
# in it, and stores the result in hyperlinks_by_cell.
def load_gui_hyperlinks
  links = HyperlinksParser.parse(@file_io, xrels: xrels)
  self.hyperlinks_by_cell = links
end
181
+
182
# SAX document that collects { cell_ref => target_url } for every
# <hyperlink> element whose relationship id can be resolved in the sheet's
# relationships (rels) document.
class HyperlinksParser < Nokogiri::XML::SAX::Document
  # file_io - IO over the sheet XML
  # xrels   - parsed rels document; must respond to #at_xpath
  def initialize(file_io, xrels:)
    @file_io = file_io
    @xrels = xrels
  end

  # Convenience wrapper: build a parser, run it, return the map.
  def self.parse(file_io, xrels:)
    new(file_io, xrels: xrels).parse
  end

  # Returns a Hash of cell ref => relationship Target.
  def parse
    @hyperlinks_by_cell = {}
    Nokogiri::XML::SAX::Parser.new(self).parse(@file_io)
    @hyperlinks_by_cell
  end

  # SAX hook; only <hyperlink> elements are of interest.
  def start_element(name, attrs = [])
    return unless name == 'hyperlink'

    attrs = attrs.to_h
    id = attrs['id'] || attrs['r:id']
    # A hyperlink without a relationship id has nothing to resolve in the
    # rels file (presumably an in-workbook link); skip it rather than
    # crash on a nil lookup.
    return if id.nil?

    relationship = @xrels.at_xpath(%(//*[@Id="#{id}"]))
    # Likewise tolerate an id the rels file doesn't contain.
    return if relationship.nil?

    @hyperlinks_by_cell[attrs['ref']] = relationship.attr('Target')
  end
end
209
+
210
# The sheet's relationships file parsed as XML (memoized), or nil when the
# sheet has no relationships file.
def xrels
  return unless xrels_file

  @xrels ||= Nokogiri::XML(xrels_file.read)
end
213
+
214
# Number of columns implied by the sheet dimension (memoized once known);
# 0 while no dimension has been recorded.
def column_length
  if @dimension
    @column_length ||= column_letter_to_number(last_cell_letter)
  else
    0
  end
end
219
+
220
# Zero-based column index of the current cell, derived from the letters of
# its reference (ex. 'C7' => 2).
def cell_idx
  letters = @cell_name[/[A-Z]+/]
  column_letter_to_number(letters) - 1
end
223
+
224
##
# Column letters of the dimension's last cell, ex. 'E' for 'A1:E9'.
# Falls back to 'A' when the dimension has no ':<letters>' part, and
# returns nil when no dimension is known.
def last_cell_letter
  return unless @dimension

  @dimension[/:([A-Z]+)/, 1] || 'A'
end
231
+
232
# Converts spreadsheet column letters to a 1-based column number, reading
# the letters as base-26 digits where 'A' = 1 ... 'Z' = 26:
#   'A'   = 1
#   'Z'   = 26
#   'AA'  = 26 * 1 + 1
#   'AZ'  = 26 * 1 + 26
#   'BA'  = 26 * 2 + 1
#   'ZZ'  = 26 * 26 + 26
#   'AAA' = 26 * 26 * 1 + 26 * 1 + 1
#   'BZA' = 26 * 26 * 2 + 26 * 26 + 1
def column_letter_to_number(column_letter)
  # Horner's scheme: shift the running total one base-26 place, then add
  # the next letter's value (byte 65 is 'A', hence the - 64).
  column_letter.each_byte.reduce(0) do |total, byte|
    total * 26 + (byte - 64)
  end
end
254
+ end
255
+ end
256
+ end
@@ -0,0 +1,115 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SimpleXlsxReader
4
+ class Loader
5
+ StyleTypesParser = Struct.new(:file_io) do
6
# Parses the styles document read from file_io and returns its array of
# style types (see #style_types).
def self.parse(file_io)
  parser = new(file_io)
  parser.parse
  parser.style_types
end
9
+
10
# Built-in (non-custom) numFmtId => casting symbol. The trailing comment on
# each entry is the format code that id stands for.
NumFmtMap = {
  # general
  0 => :string, # General

  # plain numbers
  1 => :fixnum, # 0
  2 => :float, # 0.00
  3 => :fixnum, # #,##0
  4 => :float, # #,##0.00

  # currency
  5 => :unsupported, # $#,##0_);($#,##0)
  6 => :unsupported, # $#,##0_);[Red]($#,##0)
  7 => :unsupported, # $#,##0.00_);($#,##0.00)
  8 => :unsupported, # $#,##0.00_);[Red]($#,##0.00)

  # percentages
  9 => :percentage, # 0%
  10 => :percentage, # 0.00%

  # scientific
  11 => :bignum, # 0.00E+00

  # fractions
  12 => :unsupported, # # ?/?
  13 => :unsupported, # # ??/??

  # dates
  14 => :date, # mm-dd-yy
  15 => :date, # d-mmm-yy
  16 => :date, # d-mmm
  17 => :date, # mmm-yy

  # times
  18 => :time, # h:mm AM/PM
  19 => :time, # h:mm:ss AM/PM
  20 => :time, # h:mm
  21 => :time, # h:mm:ss

  # date + time
  22 => :date_time, # m/d/yy h:mm

  # negative-in-parentheses numbers
  37 => :unsupported, # #,##0 ;(#,##0)
  38 => :unsupported, # #,##0 ;[Red](#,##0)
  39 => :unsupported, # #,##0.00;(#,##0.00)
  40 => :unsupported, # #,##0.00;[Red](#,##0.00)

  # more times
  45 => :time, # mm:ss
  46 => :time, # [h]:mm:ss
  47 => :time, # mmss.0

  # scientific, text
  48 => :bignum, # ##0.0E+0
  49 => :unsupported # @
}.freeze
45
+
46
# Reads file_io fully, parses it as XML and strips namespaces so the
# xpath queries in this parser can use bare element names.
def parse
  document = Nokogiri::XML(file_io.read)
  @xml = document.remove_namespaces!
end
49
+
50
# Excel doesn't record a type for some cells, only a display style, so the
# type has to be backed out of that style.
#
# Some styles come from a known set (see NumFmtMap); others are 'custom'
# and need a best guess.
#
# Returns one type per /styleSheet/cellXfs/xf entry, in document order.
# Sheet cells refer to a style by its index in that list, not by
# numFmtId, which is why this is an array rather than a numFmtId => type
# map. An xf without a numFmtId yields nil.
def style_types
  cell_formats = @xml.xpath('/styleSheet/cellXfs/xf')

  cell_formats.map do |cell_format|
    num_fmt_id = cell_format.attributes['numFmtId']
    style_type_by_num_fmt_id(num_fmt_id&.value)
  end
end
71
+
72
# Finds the type we think a style is; for example numFmtId 14 is a date
# style, so this returns :date. Returns nil for a nil id or an id that is
# neither built in nor declared as a custom format.
#
# Custom styles usually (are supposed to?) have a numFmtId >= 164, but in
# practice an id can simply fall outside the usual "Any Language" range
# that goes up to 49. For example, a numFmtId of 59 has been seen
# specified as a date. In Thai, 59 is a number format, so that seems like
# a bad idea, but we stay flexible: anything not built in is looked up
# among the custom formats.
def style_type_by_num_fmt_id(id)
  return nil if id.nil?

  numeric_id = id.to_i
  built_in = NumFmtMap[numeric_id]
  built_in || custom_style_types[numeric_id]
end
86
+
87
# Memoized map of every /styleSheet/numFmts/numFmt entry (the custom
# styles, normally numFmtId >= 164) to our best guess at its type,
# ex. {164 => :date_time}
def custom_style_types
  return @custom_style_types if @custom_style_types

  guesses = {}
  @xml.xpath('/styleSheet/numFmts/numFmt').each do |num_fmt|
    fmt_id = num_fmt.attributes['numFmtId'].value.to_i
    guesses[fmt_id] =
      determine_custom_style_type(num_fmt.attributes['formatCode'].value)
  end

  @custom_style_types = guesses
end
97
+
98
# This is the least deterministic part of reading xlsx files. Due to
# custom styles, you can't know for sure when a date is a date other than
# by looking at its format and guessing. It's not impossible to guess
# right, though.
#
# http://stackoverflow.com/questions/4948998/determining-if-an-xlsx-cell-is-date-formatted-for-excel-2007-spreadsheets
#
# string - the custom format code
#
# Returns :float, :date_time or :unsupported.
def determine_custom_style_type(string)
  # Format codes that open with '_' or with ' 0' are treated as floats.
  # (The ' 0' check used to compare a single character against this
  # two-character literal and therefore could never match.)
  return :float if string.start_with?('_', ' 0')

  # Looks for one of ymdhis outside of meta-stuff like [Red]
  return :date_time if string.match?(/(^|\])[^\[]*[ymdhis]/i)

  :unsupported
end
113
+ end
114
+ end
115
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SimpleXlsxReader
4
+ class Loader
5
+ WorkbookParser = Struct.new(:file_io) do
6
# Parses the workbook document read from file_io and returns the pair
# [sheet_toc, base_date].
def self.parse(file_io)
  parser = new(file_io)
  parser.parse
  [parser.sheet_toc, parser.base_date]
end
10
+
11
# Reads file_io fully, parses it as XML and strips namespaces so the
# xpath queries in this parser can use bare element names.
def parse
  document = Nokogiri::XML(file_io.read)
  @xml = document.remove_namespaces!
end
14
+
15
# Table of contents for the sheets: sheet name => (sheetId - 1),
# ex. {'Authors' => 0, ...}
def sheet_toc
  toc = {}

  @xml.xpath('/workbook/sheets/sheet').each do |sheet|
    sheet_name = sheet.attributes['name'].value
    sheet_id = sheet.attributes['sheetId'].value.to_i
    toc[sheet_name] = sheet_id - 1 # keep things 0-indexed
  end

  toc
end
23
+
24
## Returns the base_date from which to calculate dates.
# Defaults to the 1900 date system, but uses the 1904 one when any
# workbookPr node carries a date1904 attribute containing 'true' or '1'.
# Also returns the 1900 system when nothing has been parsed yet.
# http://msdn.microsoft.com/en-us/library/ff530155(v=office.12).aspx
def base_date
  return DATE_SYSTEM_1900 if @xml.nil?

  properties = @xml.xpath('//workbook/workbookPr[@date1904]')
  uses_1904 = properties.any? { |workbook_pr| workbook_pr['date1904'] =~ /true|1/i }

  uses_1904 ? DATE_SYSTEM_1904 : DATE_SYSTEM_1900
end
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,199 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SimpleXlsxReader
4
+ class Loader < Struct.new(:file_path)
5
+ attr_accessor :shared_strings, :sheet_parsers, :sheet_toc, :style_types, :base_date
6
+
7
# Reads the workbook archive (which fills in this loader's sheet_toc,
# sheet_parsers, etc.) and returns one Document::Sheet per table-of-contents
# entry, pairing each sheet name with the parser at the same position.
def init_sheets
  reader = ZipReader.new(file_path: file_path, loader: self)
  reader.read

  # The number stored in sheet_toc is *not* the index into sheet_parsers;
  # the entry's position is.
  sheet_toc.each_with_index.map do |(name, _number), position|
    SimpleXlsxReader::Document::Sheet.new(
      name: name,
      sheet_parser: sheet_parsers[position]
    )
  end
end
21
+
22
+ ZipReader = Struct.new(:file_path, :loader, keyword_init: true) do
23
+ attr_reader :zip
24
+
25
# Lets Struct populate the members from the given arguments, then opens
# the archive at file_path and keeps the handle in @zip.
def initialize(*args)
  super(*args)
  @zip = SimpleXlsxReader::Zip.open(file_path)
end
29
+
30
# Loads everything the loader needs from the archive: sheet table of
# contents and base date, style types, shared strings, and one sheet
# parser per worksheet file found.
def read
  entry_at('xl/workbook.xml') do |workbook_io|
    toc, base_date = WorkbookParser.parse(workbook_io)
    loader.sheet_toc = toc
    loader.base_date = base_date
  end

  entry_at('xl/styles.xml') do |styles_io|
    loader.style_types = StyleTypesParser.parse(styles_io)
  end

  # Shared strings are an optional feature used by excel, but not often
  # used by xlsx generation libraries.
  shared_strings_entry = entry_at('xl/sharedStrings.xml')
  if shared_strings_entry
    shared_strings_entry.get_input_stream do |strings_io|
      loader.shared_strings = SharedStringsParser.parse(strings_io)
    end
  else
    loader.shared_strings = []
  end

  loader.sheet_parsers = []

  # Sometimes there's a zero-index sheet.xml, ex.
  # Google Docs creates:
  #   xl/worksheets/sheet.xml
  #   xl/worksheets/sheet1.xml
  #   xl/worksheets/sheet2.xml
  # While Excel creates:
  #   xl/worksheets/sheet1.xml
  #   xl/worksheets/sheet2.xml
  add_sheet_parser_at_index(nil)

  # Then keep going through sheet1, sheet2, ... until one is missing.
  sheet_index = 1
  sheet_index += 1 while add_sheet_parser_at_index(sheet_index)
end
66
+
67
# Finds the archive entry for `path`, trying a few spellings of it.
#
# With a block, the entry's input stream is opened with that block and the
# result of get_input_stream is returned; without one, the entry itself
# (or nil/false when none of the spellings exist) is returned.
def entry_at(path, &block)
  # Older and newer (post-mid-2021) RubyZip normalizes pathnames,
  # but unfortunately there is a time in between where it doesn't.
  # Rather than require a specific version, let's just be flexible.
  windows_path = path.tr('/', '\\')
  candidates = [
    path,                 # *nix-generated
    windows_path,         # Windows-generated
    path.downcase,        # Sometimes it's lowercase
    windows_path.downcase # Sometimes it's lowercase
  ]

  entry = nil
  candidates.each do |candidate|
    entry = zip.find_entry(candidate)
    break if entry
  end

  return entry unless block

  entry.get_input_stream(&block)
end
83
+
84
# Appends a SheetParser for xl/worksheets/sheet<i>.xml to the loader's
# sheet_parsers, attaching the sheet's relationships file when it has one.
#
# Returns nil when no such worksheet exists, otherwise the (truthy)
# sheet_parsers array, which is what lets callers loop until it is falsy.
def add_sheet_parser_at_index(i)
  sheet_entry = entry_at("xl/worksheets/sheet#{i}.xml")
  return unless sheet_entry

  parser = SheetParser.new(
    file_io: sheet_entry.get_input_stream,
    loader: loader
  )

  rels_entry = entry_at("xl/worksheets/_rels/sheet#{i}.xml.rels")
  parser.xrels_file = rels_entry.get_input_stream if rels_entry

  loader.sheet_parsers << parser
end
101
+ end
102
+
103
##
# The heart of typecasting. The ruby type is determined either explicitly
# from the cell xml or implicitly from the cell style, and this method
# expects that work to have been done already. It takes the type we
# determined and casts the cell value to it.
#
# value - raw cell text; nil or '' casts to nil
# type  - the cell's explicit type, or nil:
#   - s: shared string (index into options[:shared_strings])
#   - n: number (cast to a float)
#   - b: boolean
#   - str: string
#   - inlineStr: string
# style - ruby symbol derived from the cell style; used as the type when
#         no explicit type is given, or when a number carries a
#         date/time style
#
# options:
#   - shared_strings: needed for 's' (shared string) type
#   - base_date: day zero for date serials (defaults to DATE_SYSTEM_1900)
#   - url: when present, the result is wrapped in a Hyperlink
def self.cast(value, type, style, options = {})
  return nil if value.nil? || value.empty?

  # Sometimes the type is dictated by the style alone
  style_dictates_type =
    type.nil? || (type == 'n' && %i[date time date_time].include?(style))
  type = style if style_dictates_type

  casted =
    case type
    when 's' # shared string
      options[:shared_strings][value.to_i]
    when 'n', :float
      value.to_f
    when 'b'
      value.to_i == 1
    when :fixnum
      value.to_i
    when :percentage
      value.to_f / 100
    when :date, :time, :date_time
      # The trickiest. Note that all these formats can vary on whether
      # they actually contain a date, time, or datetime.
      serial = Float(value)
      whole_days = serial.to_i
      day_fraction = serial - whole_days

      # http://stackoverflow.com/questions/10559767/how-to-convert-ms-excel-date-from-float-to-date-format-in-ruby
      date = options.fetch(:base_date, DATE_SYSTEM_1900) + whole_days

      # NOTE: both exits below return directly, so date/time values are
      # never wrapped in a Hyperlink even when options[:url] is given.
      return date unless day_fraction > 0

      # there is a time associated
      seconds = (day_fraction * 86_400).round
      return Time.utc(date.year, date.month, date.day) + seconds
    when :bignum
      defined?(BigDecimal) ? BigDecimal(value) : value.to_f
    else
      # 'str', 'inlineStr', :string, :unsupported and anything we don't
      # recognise (beats me) all pass the raw text through
      value
    end

  options[:url] ? Hyperlink.new(options[:url], casted) : casted
end
197
+ end
198
+ end
199
+
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module SimpleXlsxReader
2
- VERSION = "1.0.5"
4
+ VERSION = '2.0.0'
3
5
  end