simple_xlsx_reader 1.0.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,18 @@
1
- require "simple_xlsx_reader/version"
1
+ # frozen_string_literal: true
2
+
2
3
  require 'nokogiri'
3
4
  require 'date'
4
5
 
6
+ require 'simple_xlsx_reader/version'
7
+ require 'simple_xlsx_reader/hyperlink'
8
+ require 'simple_xlsx_reader/document'
9
+ require 'simple_xlsx_reader/loader'
10
+ require 'simple_xlsx_reader/loader/workbook_parser'
11
+ require 'simple_xlsx_reader/loader/shared_strings_parser'
12
+ require 'simple_xlsx_reader/loader/sheet_parser'
13
+ require 'simple_xlsx_reader/loader/style_types_parser'
14
+
15
+
5
16
  # Rubyzip 1.0 only has different naming, everything else is the same, so let's
6
17
  # be flexible so we don't force people into a dependency hell w/ other gems.
7
18
  begin
@@ -17,452 +28,22 @@ rescue LoadError
17
28
  end
18
29
 
19
30
  module SimpleXlsxReader
20
- class CellLoadError < StandardError; end
21
-
22
- def self.configuration
23
- @configuration ||= Struct.new(:catch_cell_load_errors).new.tap do |c|
24
- c.catch_cell_load_errors = false
25
- end
26
- end
27
-
28
- def self.open(file_path)
29
- Document.new(file_path).tap(&:sheets)
30
- end
31
-
32
- class Document
33
- attr_reader :file_path
34
-
35
- def initialize(file_path)
36
- @file_path = file_path
37
- end
38
-
39
- def sheets
40
- @sheets ||= Mapper.new(xml).load_sheets
41
- end
31
+ DATE_SYSTEM_1900 = Date.new(1899, 12, 30)
32
+ DATE_SYSTEM_1904 = Date.new(1904, 1, 1)
42
33
 
43
- def to_hash
44
- sheets.inject({}) {|acc, sheet| acc[sheet.name] = sheet.rows; acc}
45
- end
46
-
47
- def xml
48
- Xml.load(file_path)
49
- end
50
-
51
- class Sheet < Struct.new(:name, :rows)
52
- def headers
53
- rows[0]
54
- end
55
-
56
- def data
57
- rows[1..-1]
58
- end
34
+ class CellLoadError < StandardError; end
59
35
 
60
- # Load errors will be a hash of the form:
61
- # {
62
- # [rownum, colnum] => '[error]'
63
- # }
64
- def load_errors
65
- @load_errors ||= {}
36
+ class << self
37
+ def configuration
38
+ @configuration ||= Struct.new(:catch_cell_load_errors, :auto_slurp).new.tap do |c|
39
+ c.catch_cell_load_errors = false
40
+ c.auto_slurp = false
66
41
  end
67
42
  end
68
43
 
69
- ##
70
- # For internal use; stores source xml in nokogiri documents
71
- class Xml
72
- attr_accessor :workbook, :shared_strings, :sheets, :styles
73
-
74
- def self.load(file_path)
75
- self.new.tap do |xml|
76
- SimpleXlsxReader::Zip.open(file_path) do |zip|
77
- xml.workbook = Nokogiri::XML(zip.read('xl/workbook.xml')).remove_namespaces!
78
- xml.styles = Nokogiri::XML(zip.read('xl/styles.xml')).remove_namespaces!
79
-
80
- # optional feature used by excel, but not often used by xlsx
81
- # generation libraries
82
- ss_file = (zip.to_a.map(&:name) & ['xl/sharedStrings.xml','xl/sharedstrings.xml'])[0]
83
- if ss_file
84
- xml.shared_strings = Nokogiri::XML(zip.read(ss_file)).remove_namespaces!
85
- end
86
-
87
- xml.sheets = []
88
- i = 0
89
- loop do
90
- i += 1
91
- break if !zip.file.file?("xl/worksheets/sheet#{i}.xml")
92
-
93
- xml.sheets <<
94
- Nokogiri::XML(zip.read("xl/worksheets/sheet#{i}.xml")).remove_namespaces!
95
- end
96
- end
97
- end
98
- end
44
+ def open(file_path)
45
+ Document.new(file_path).tap(&:sheets)
99
46
  end
100
-
101
- ##
102
- # For internal use; translates source xml to Sheet objects.
103
- class Mapper < Struct.new(:xml)
104
- DATE_SYSTEM_1900 = Date.new(1899, 12, 30)
105
- DATE_SYSTEM_1904 = Date.new(1904, 1, 1)
106
-
107
- def load_sheets
108
- sheet_toc.each_with_index.map do |(sheet_name, _sheet_number), i|
109
- parse_sheet(sheet_name, xml.sheets[i]) # sheet_number is *not* the index into xml.sheets
110
- end
111
- end
112
-
113
- # Table of contents for the sheets, ex. {'Authors' => 0, ...}
114
- def sheet_toc
115
- xml.workbook.xpath('/workbook/sheets/sheet').
116
- inject({}) do |acc, sheet|
117
-
118
- acc[sheet.attributes['name'].value] =
119
- sheet.attributes['sheetId'].value.to_i - 1 # keep things 0-indexed
120
-
121
- acc
122
- end
123
- end
124
-
125
- def parse_sheet(sheet_name, xsheet)
126
- sheet = Sheet.new(sheet_name)
127
- sheet_width, sheet_height = *sheet_dimensions(xsheet)
128
-
129
- sheet.rows = Array.new(sheet_height) { Array.new(sheet_width) }
130
- xsheet.xpath("/worksheet/sheetData/row/c").each do |xcell|
131
- column, row = *xcell.attr('r').match(/([A-Z]+)([0-9]+)/).captures
132
- col_idx = column_letter_to_number(column) - 1
133
- row_idx = row.to_i - 1
134
-
135
- type = xcell.attributes['t'] &&
136
- xcell.attributes['t'].value
137
- style = xcell.attributes['s'] &&
138
- style_types[xcell.attributes['s'].value.to_i]
139
-
140
- # This is the main performance bottleneck. Using just 'xcell.text'
141
- # would be ideal, and makes parsing super-fast. However, there's
142
- # other junk in the cell, formula references in particular,
143
- # so we really do have to look for specific value nodes.
144
- # Maybe there is a really clever way to use xcell.text and parse out
145
- # the correct value, but I can't think of one, or an alternative
146
- # strategy.
147
- #
148
- # And yes, this really is faster than using xcell.at_xpath(...),
149
- # by about 60%. Odd.
150
- xvalue = type == 'inlineStr' ?
151
- (xis = xcell.children.find {|c| c.name == 'is'}) && xis.children.find {|c| c.name == 't'} :
152
- xcell.children.find {|c| c.name == 'v'}
153
-
154
- cell = begin
155
- self.class.cast(xvalue && xvalue.text.strip, type, style,
156
- :shared_strings => shared_strings,
157
- :base_date => base_date)
158
- rescue => e
159
- if !SimpleXlsxReader.configuration.catch_cell_load_errors
160
- error = CellLoadError.new(
161
- "Row #{row_idx}, Col #{col_idx}: #{e.message}")
162
- error.set_backtrace(e.backtrace)
163
- raise error
164
- else
165
- sheet.load_errors[[row_idx, col_idx]] = e.message
166
-
167
- xcell.text.strip
168
- end
169
- end
170
-
171
- # This shouldn't be necessary, but just in case, we'll create
172
- # the row so we don't blow up. This means any null rows in between
173
- # will be null instead of [null, null, ...]
174
- sheet.rows[row_idx] ||= Array.new(sheet_width)
175
-
176
- sheet.rows[row_idx][col_idx] = cell
177
- end
178
-
179
- sheet
180
- end
181
-
182
- ##
183
- # Returns the last column name, ex. 'E'
184
- #
185
- # Note that excel writes a '/worksheet/dimension' node we can get the
186
- # last cell from, but some libs (ex. simple_xlsx_writer) don't record
187
- # this. In that case, we assume the data is of uniform column length
188
- # and check the column name of the last header row. Obviously this isn't
189
- # the most robust strategy, but it likely fits 99% of use cases
190
- # considering it's not a problem with actual excel docs.
191
- def last_cell_label(xsheet)
192
- dimension = xsheet.at_xpath('/worksheet/dimension')
193
- if dimension
194
- col = dimension.attributes['ref'].value.match(/:([A-Z]+[0-9]+)/)
195
- col ? col.captures.first : 'A1'
196
- else
197
- last = xsheet.at_xpath("/worksheet/sheetData/row[last()]/c[last()]")
198
- last ? last.attributes['r'].value.match(/([A-Z]+[0-9]+)/).captures.first : 'A1'
199
- end
200
- end
201
-
202
- # Returns dimensions (1-indexed)
203
- def sheet_dimensions(xsheet)
204
- column, row = *last_cell_label(xsheet).match(/([A-Z]+)([0-9]+)/).captures
205
- [column_letter_to_number(column), row.to_i]
206
- end
207
-
208
- # formula fits an exponential factorial function of the form:
209
- # 'A' = 1
210
- # 'B' = 2
211
- # 'Z' = 26
212
- # 'AA' = 26 * 1 + 1
213
- # 'AZ' = 26 * 1 + 26
214
- # 'BA' = 26 * 2 + 1
215
- # 'ZA' = 26 * 26 + 1
216
- # 'ZZ' = 26 * 26 + 26
217
- # 'AAA' = 26 * 26 * 1 + 26 * 1 + 1
218
- # 'AAZ' = 26 * 26 * 1 + 26 * 1 + 26
219
- # 'ABA' = 26 * 26 * 1 + 26 * 2 + 1
220
- # 'BZA' = 26 * 26 * 2 + 26 * 26 + 1
221
- def column_letter_to_number(column_letter)
222
- pow = column_letter.length - 1
223
- result = 0
224
- column_letter.each_byte do |b|
225
- result += 26**pow * (b - 64)
226
- pow -= 1
227
- end
228
- result
229
- end
230
-
231
- # Excel doesn't record types for some cells, only its display style, so
232
- # we have to back out the type from that style.
233
- #
234
- # Some of these styles can be determined from a known set (see NumFmtMap),
235
- # while others are 'custom' and we have to make a best guess.
236
- #
237
- # This is the array of types corresponding to the styles a spreadsheet
238
- # uses, and includes both the known style types and the custom styles.
239
- #
240
- # Note that the xml sheet cells that use this don't reference the
241
- # numFmtId, but instead the array index of a style in the stored list of
242
- # only the styles used in the spreadsheet (which can be either known or
243
- # custom). Hence this style types array, rather than a map of numFmtId to
244
- # type.
245
- def style_types
246
- @style_types ||=
247
- xml.styles.xpath('/styleSheet/cellXfs/xf').map {|xstyle|
248
- style_type_by_num_fmt_id(num_fmt_id(xstyle))}
249
- end
250
-
251
- #returns the numFmtId value if it's available
252
- def num_fmt_id(xstyle)
253
- if xstyle.attributes['numFmtId']
254
- xstyle.attributes['numFmtId'].value
255
- else
256
- nil
257
- end
258
- end
259
-
260
- # Finds the type we think a style is; For example, fmtId 14 is a date
261
- # style, so this would return :date.
262
- #
263
- # Note, custom styles usually (are supposed to?) have a numFmtId >= 164,
264
- # but in practice can sometimes be simply out of the usual "Any Language"
265
- # id range that goes up to 49. For example, I have seen a numFmtId of
266
- # 59 specified as a date. In Thai, 59 is a number format, so this seems
267
- # like a bad idea, but we try to be flexible and just go with it.
268
- def style_type_by_num_fmt_id(id)
269
- return nil if id.nil?
270
-
271
- id = id.to_i
272
- NumFmtMap[id] || custom_style_types[id]
273
- end
274
-
275
- # Map of (numFmtId >= 164) (custom styles) to our best guess at the type
276
- # ex. {164 => :date_time}
277
- def custom_style_types
278
- @custom_style_types ||=
279
- xml.styles.xpath('/styleSheet/numFmts/numFmt').
280
- inject({}) do |acc, xstyle|
281
-
282
- acc[xstyle.attributes['numFmtId'].value.to_i] =
283
- determine_custom_style_type(xstyle.attributes['formatCode'].value)
284
-
285
- acc
286
- end
287
- end
288
-
289
- # This is the least deterministic part of reading xlsx files. Due to
290
- # custom styles, you can't know for sure when a date is a date other than
291
- # looking at its format and gessing. It's not impossible to guess right,
292
- # though.
293
- #
294
- # http://stackoverflow.com/questions/4948998/determining-if-an-xlsx-cell-is-date-formatted-for-excel-2007-spreadsheets
295
- def determine_custom_style_type(string)
296
- return :float if string[0] == '_'
297
- return :float if string[0] == ' 0'
298
-
299
- # Looks for one of ymdhis outside of meta-stuff like [Red]
300
- return :date_time if string =~ /(^|\])[^\[]*[ymdhis]/i
301
-
302
- return :unsupported
303
- end
304
-
305
- ##
306
- # The heart of typecasting. The ruby type is determined either explicitly
307
- # from the cell xml or implicitly from the cell style, and this
308
- # method expects that work to have been done already. This, then,
309
- # takes the type we determined it to be and casts the cell value
310
- # to that type.
311
- #
312
- # types:
313
- # - s: shared string (see #shared_string)
314
- # - n: number (cast to a float)
315
- # - b: boolean
316
- # - str: string
317
- # - inlineStr: string
318
- # - ruby symbol: for when type has been determined by style
319
- #
320
- # options:
321
- # - shared_strings: needed for 's' (shared string) type
322
- def self.cast(value, type, style, options = {})
323
- return nil if value.nil? || value.empty?
324
-
325
- # Sometimes the type is dictated by the style alone
326
- if type.nil? ||
327
- (type == 'n' && [:date, :time, :date_time].include?(style))
328
- type = style
329
- end
330
-
331
- case type
332
-
333
- ##
334
- # There are few built-in types
335
- ##
336
-
337
- when 's' # shared string
338
- options[:shared_strings][value.to_i]
339
- when 'n' # number
340
- value.to_f
341
- when 'b'
342
- value.to_i == 1
343
- when 'str'
344
- value
345
- when 'inlineStr'
346
- value
347
-
348
- ##
349
- # Type can also be determined by a style,
350
- # detected earlier and cast here by its standardized symbol
351
- ##
352
-
353
- when :string, :unsupported
354
- value
355
- when :fixnum
356
- value.to_i
357
- when :float
358
- value.to_f
359
- when :percentage
360
- value.to_f / 100
361
- # the trickiest. note that all these formats can vary on
362
- # whether they actually contain a date, time, or datetime.
363
- when :date, :time, :date_time
364
- value = value.to_f
365
- days_since_date_system_start = value.to_i
366
- fraction_of_24 = value - days_since_date_system_start
367
-
368
- # http://stackoverflow.com/questions/10559767/how-to-convert-ms-excel-date-from-float-to-date-format-in-ruby
369
- date = options.fetch(:base_date, DATE_SYSTEM_1900) + days_since_date_system_start
370
-
371
- if fraction_of_24 > 0 # there is a time associated
372
- seconds = (fraction_of_24 * 86400).round
373
- return Time.utc(date.year, date.month, date.day) + seconds
374
- else
375
- return date
376
- end
377
- when :bignum
378
- if defined?(BigDecimal)
379
- BigDecimal.new(value)
380
- else
381
- value.to_f
382
- end
383
-
384
- ##
385
- # Beats me
386
- ##
387
-
388
- else
389
- value
390
- end
391
- end
392
-
393
- ## Returns the base_date from which to calculate dates.
394
- # Defaults to 1900 (minus two days due to excel quirk), but use 1904 if
395
- # it's set in the Workbook's workbookPr.
396
- # http://msdn.microsoft.com/en-us/library/ff530155(v=office.12).aspx
397
- def base_date
398
- @base_date ||=
399
- begin
400
- return DATE_SYSTEM_1900 if xml.workbook == nil
401
- xml.workbook.xpath("//workbook/workbookPr[@date1904]").each do |workbookPr|
402
- return DATE_SYSTEM_1904 if workbookPr["date1904"] =~ /true|1/i
403
- end
404
- DATE_SYSTEM_1900
405
- end
406
- end
407
-
408
- # Map of non-custom numFmtId to casting symbol
409
- NumFmtMap = {
410
- 0 => :string, # General
411
- 1 => :fixnum, # 0
412
- 2 => :float, # 0.00
413
- 3 => :fixnum, # #,##0
414
- 4 => :float, # #,##0.00
415
- 5 => :unsupported, # $#,##0_);($#,##0)
416
- 6 => :unsupported, # $#,##0_);[Red]($#,##0)
417
- 7 => :unsupported, # $#,##0.00_);($#,##0.00)
418
- 8 => :unsupported, # $#,##0.00_);[Red]($#,##0.00)
419
- 9 => :percentage, # 0%
420
- 10 => :percentage, # 0.00%
421
- 11 => :bignum, # 0.00E+00
422
- 12 => :unsupported, # # ?/?
423
- 13 => :unsupported, # # ??/??
424
- 14 => :date, # mm-dd-yy
425
- 15 => :date, # d-mmm-yy
426
- 16 => :date, # d-mmm
427
- 17 => :date, # mmm-yy
428
- 18 => :time, # h:mm AM/PM
429
- 19 => :time, # h:mm:ss AM/PM
430
- 20 => :time, # h:mm
431
- 21 => :time, # h:mm:ss
432
- 22 => :date_time, # m/d/yy h:mm
433
- 37 => :unsupported, # #,##0 ;(#,##0)
434
- 38 => :unsupported, # #,##0 ;[Red](#,##0)
435
- 39 => :unsupported, # #,##0.00;(#,##0.00)
436
- 40 => :unsupported, # #,##0.00;[Red](#,##0.00)
437
- 45 => :time, # mm:ss
438
- 46 => :time, # [h]:mm:ss
439
- 47 => :time, # mmss.0
440
- 48 => :bignum, # ##0.0E+0
441
- 49 => :unsupported # @
442
- }
443
-
444
- # For performance reasons, excel uses an optional SpreadsheetML feature
445
- # that puts all strings in a separate xml file, and then references
446
- # them by their index in that file.
447
- #
448
- # http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
449
- def shared_strings
450
- @shared_strings ||= begin
451
- if xml.shared_strings
452
- xml.shared_strings.xpath('/sst/si').map do |xsst|
453
- # a shared string can be a single value...
454
- sst = xsst.at_xpath('t/text()')
455
- sst = sst.text if sst
456
- # ... or a composite of seperately styled words/characters
457
- sst ||= xsst.xpath('r/t/text()').map(&:text).join
458
- end
459
- else
460
- []
461
- end
462
- end
463
- end
464
-
465
- end
466
-
47
+ alias parse open
467
48
  end
468
49
  end
@@ -7,19 +7,21 @@ Gem::Specification.new do |gem|
7
7
  gem.name = "simple_xlsx_reader"
8
8
  gem.version = SimpleXlsxReader::VERSION
9
9
  gem.authors = ["Woody Peterson"]
10
- gem.email = ["woody@sigby.com"]
10
+ gem.email = ["woody.peterson@gmail.com"]
11
11
  gem.description = %q{Read xlsx data the Ruby way}
12
12
  gem.summary = %q{Read xlsx data the Ruby way}
13
13
  gem.homepage = ""
14
+ gem.license = "MIT"
14
15
 
15
16
  gem.add_dependency 'nokogiri'
16
17
  gem.add_dependency 'rubyzip'
17
18
 
18
19
  gem.add_development_dependency 'minitest', '>= 5.0'
20
+ gem.add_development_dependency 'rake'
19
21
  gem.add_development_dependency 'pry'
20
22
 
21
23
  gem.files = `git ls-files`.split($/)
22
24
  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
23
- gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
25
+ gem.test_files = gem.files.grep(%r{^test/})
24
26
  gem.require_paths = ["lib"]
25
27
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require_relative 'test_helper'
2
4
 
3
5
  describe SimpleXlsxReader do
@@ -5,9 +7,8 @@ describe SimpleXlsxReader do
5
7
  let(:subject) { SimpleXlsxReader::Document.new(date1904_file) }
6
8
 
7
9
  it 'supports converting dates with the 1904 date system' do
8
- subject.to_hash.must_equal({
9
- "date1904" => [[Date.parse("2014-05-01")]]
10
- })
10
+ _(subject.to_hash).must_equal(
11
+ 'date1904' => [[Date.parse('2014-05-01')]]
12
+ )
11
13
  end
12
-
13
14
  end
@@ -1,19 +1,26 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require_relative 'test_helper'
2
4
 
3
5
  describe SimpleXlsxReader do
4
- let(:datetimes_file) { File.join(File.dirname(__FILE__),
5
- 'datetimes.xlsx') }
6
+ let(:datetimes_file) do
7
+ File.join(
8
+ File.dirname(__FILE__),
9
+ 'datetimes.xlsx'
10
+ )
11
+ end
6
12
 
7
13
  let(:subject) { SimpleXlsxReader::Document.new(datetimes_file) }
8
14
 
9
15
  it 'converts date_times with the correct precision' do
10
- subject.to_hash.must_equal({
11
- "Datetimes" =>
12
- [[Time.parse("2013-08-19 18:29:59 UTC")],
13
- [Time.parse("2013-08-19 18:30:00 UTC")],
14
- [Time.parse("2013-08-19 18:30:01 UTC")],
15
- [Time.parse("1899-12-30 00:30:00 UTC")]]
16
- })
16
+ _(subject.to_hash).must_equal(
17
+ 'Datetimes' =>
18
+ [
19
+ [Time.parse('2013-08-19 18:29:59 UTC')],
20
+ [Time.parse('2013-08-19 18:30:00 UTC')],
21
+ [Time.parse('2013-08-19 18:30:01 UTC')],
22
+ [Time.parse('1899-12-30 00:30:00 UTC')]
23
+ ]
24
+ )
17
25
  end
18
-
19
26
  end
Binary file
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'test_helper'
4
+ require 'time'
5
+
6
+ describe SimpleXlsxReader do
7
+ let(:one_sheet_file) { File.join(File.dirname(__FILE__), 'gdocs_sheet.xlsx') }
8
+ let(:subject) { SimpleXlsxReader::Document.new(one_sheet_file) }
9
+
10
+ it 'able to load file from google docs' do
11
+ _(subject.to_hash).must_equal(
12
+ 'List 1' => [['Empty gdocs list 1']],
13
+ 'List 2' => [['Empty gdocs list 2']]
14
+ )
15
+ end
16
+ end
@@ -1,15 +1,20 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require_relative 'test_helper'
2
4
 
3
5
  describe SimpleXlsxReader do
4
- let(:lower_case_shared_strings) { File.join(File.dirname(__FILE__),
5
- 'lower_case_sharedstrings.xlsx') }
6
+ let(:lower_case_shared_strings) do
7
+ File.join(
8
+ File.dirname(__FILE__),
9
+ 'lower_case_sharedstrings.xlsx'
10
+ )
11
+ end
6
12
 
7
13
  let(:subject) { SimpleXlsxReader::Document.new(lower_case_shared_strings) }
8
14
 
9
-
10
15
  describe '#to_hash' do
11
16
  it 'should have the word Well in the first row' do
12
- subject.sheets.first.rows[0].must_include('Well')
17
+ _(subject.sheets.first.rows.to_a[0]).must_include('Well')
13
18
  end
14
19
  end
15
20
  end