simple_xlsx_reader 1.0.2 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,7 +1,18 @@
1
- require "simple_xlsx_reader/version"
1
+ # frozen_string_literal: true
2
+
2
3
  require 'nokogiri'
3
4
  require 'date'
4
5
 
6
+ require 'simple_xlsx_reader/version'
7
+ require 'simple_xlsx_reader/hyperlink'
8
+ require 'simple_xlsx_reader/document'
9
+ require 'simple_xlsx_reader/loader'
10
+ require 'simple_xlsx_reader/loader/workbook_parser'
11
+ require 'simple_xlsx_reader/loader/shared_strings_parser'
12
+ require 'simple_xlsx_reader/loader/sheet_parser'
13
+ require 'simple_xlsx_reader/loader/style_types_parser'
14
+
15
+
5
16
  # Rubyzip 1.0 only has different naming, everything else is the same, so let's
6
17
  # be flexible so we don't force people into a dependency hell w/ other gems.
7
18
  begin
@@ -17,452 +28,22 @@ rescue LoadError
17
28
  end
18
29
 
19
30
  module SimpleXlsxReader
20
- class CellLoadError < StandardError; end
21
-
22
- def self.configuration
23
- @configuration ||= Struct.new(:catch_cell_load_errors).new.tap do |c|
24
- c.catch_cell_load_errors = false
25
- end
26
- end
27
-
28
- def self.open(file_path)
29
- Document.new(file_path).tap(&:sheets)
30
- end
31
-
32
- class Document
33
- attr_reader :file_path
34
-
35
- def initialize(file_path)
36
- @file_path = file_path
37
- end
38
-
39
- def sheets
40
- @sheets ||= Mapper.new(xml).load_sheets
41
- end
31
+ DATE_SYSTEM_1900 = Date.new(1899, 12, 30)
32
+ DATE_SYSTEM_1904 = Date.new(1904, 1, 1)
42
33
 
43
- def to_hash
44
- sheets.inject({}) {|acc, sheet| acc[sheet.name] = sheet.rows; acc}
45
- end
46
-
47
- def xml
48
- Xml.load(file_path)
49
- end
50
-
51
- class Sheet < Struct.new(:name, :rows)
52
- def headers
53
- rows[0]
54
- end
55
-
56
- def data
57
- rows[1..-1]
58
- end
34
+ class CellLoadError < StandardError; end
59
35
 
60
- # Load errors will be a hash of the form:
61
- # {
62
- # [rownum, colnum] => '[error]'
63
- # }
64
- def load_errors
65
- @load_errors ||= {}
36
+ class << self
37
+ def configuration
38
+ @configuration ||= Struct.new(:catch_cell_load_errors, :auto_slurp).new.tap do |c|
39
+ c.catch_cell_load_errors = false
40
+ c.auto_slurp = false
66
41
  end
67
42
  end
68
43
 
69
- ##
70
- # For internal use; stores source xml in nokogiri documents
71
- class Xml
72
- attr_accessor :workbook, :shared_strings, :sheets, :styles
73
-
74
- def self.load(file_path)
75
- self.new.tap do |xml|
76
- SimpleXlsxReader::Zip.open(file_path) do |zip|
77
- xml.workbook = Nokogiri::XML(zip.read('xl/workbook.xml')).remove_namespaces!
78
- xml.styles = Nokogiri::XML(zip.read('xl/styles.xml')).remove_namespaces!
79
-
80
- # optional feature used by excel, but not often used by xlsx
81
- # generation libraries
82
- ss_file = (zip.to_a.map(&:name) & ['xl/sharedStrings.xml','xl/sharedstrings.xml'])[0]
83
- if ss_file
84
- xml.shared_strings = Nokogiri::XML(zip.read(ss_file)).remove_namespaces!
85
- end
86
-
87
- xml.sheets = []
88
- i = 0
89
- loop do
90
- i += 1
91
- break if !zip.file.file?("xl/worksheets/sheet#{i}.xml")
92
-
93
- xml.sheets <<
94
- Nokogiri::XML(zip.read("xl/worksheets/sheet#{i}.xml")).remove_namespaces!
95
- end
96
- end
97
- end
98
- end
44
+ def open(file_path)
45
+ Document.new(file_path).tap(&:sheets)
99
46
  end
100
-
101
- ##
102
- # For internal use; translates source xml to Sheet objects.
103
- class Mapper < Struct.new(:xml)
104
- DATE_SYSTEM_1900 = Date.new(1899, 12, 30)
105
- DATE_SYSTEM_1904 = Date.new(1904, 1, 1)
106
-
107
- def load_sheets
108
- sheet_toc.each_with_index.map do |(sheet_name, _sheet_number), i|
109
- parse_sheet(sheet_name, xml.sheets[i]) # sheet_number is *not* the index into xml.sheets
110
- end
111
- end
112
-
113
- # Table of contents for the sheets, ex. {'Authors' => 0, ...}
114
- def sheet_toc
115
- xml.workbook.xpath('/workbook/sheets/sheet').
116
- inject({}) do |acc, sheet|
117
-
118
- acc[sheet.attributes['name'].value] =
119
- sheet.attributes['sheetId'].value.to_i - 1 # keep things 0-indexed
120
-
121
- acc
122
- end
123
- end
124
-
125
- def parse_sheet(sheet_name, xsheet)
126
- sheet = Sheet.new(sheet_name)
127
- sheet_width, sheet_height = *sheet_dimensions(xsheet)
128
-
129
- sheet.rows = Array.new(sheet_height) { Array.new(sheet_width) }
130
- xsheet.xpath("/worksheet/sheetData/row/c").each do |xcell|
131
- column, row = *xcell.attr('r').match(/([A-Z]+)([0-9]+)/).captures
132
- col_idx = column_letter_to_number(column) - 1
133
- row_idx = row.to_i - 1
134
-
135
- type = xcell.attributes['t'] &&
136
- xcell.attributes['t'].value
137
- style = xcell.attributes['s'] &&
138
- style_types[xcell.attributes['s'].value.to_i]
139
-
140
- # This is the main performance bottleneck. Using just 'xcell.text'
141
- # would be ideal, and makes parsing super-fast. However, there's
142
- # other junk in the cell, formula references in particular,
143
- # so we really do have to look for specific value nodes.
144
- # Maybe there is a really clever way to use xcell.text and parse out
145
- # the correct value, but I can't think of one, or an alternative
146
- # strategy.
147
- #
148
- # And yes, this really is faster than using xcell.at_xpath(...),
149
- # by about 60%. Odd.
150
- xvalue = type == 'inlineStr' ?
151
- (xis = xcell.children.find {|c| c.name == 'is'}) && xis.children.find {|c| c.name == 't'} :
152
- xcell.children.find {|c| c.name == 'v'}
153
-
154
- cell = begin
155
- self.class.cast(xvalue && xvalue.text.strip, type, style,
156
- :shared_strings => shared_strings,
157
- :base_date => base_date)
158
- rescue => e
159
- if !SimpleXlsxReader.configuration.catch_cell_load_errors
160
- error = CellLoadError.new(
161
- "Row #{row_idx}, Col #{col_idx}: #{e.message}")
162
- error.set_backtrace(e.backtrace)
163
- raise error
164
- else
165
- sheet.load_errors[[row_idx, col_idx]] = e.message
166
-
167
- xcell.text.strip
168
- end
169
- end
170
-
171
- # This shouldn't be necessary, but just in case, we'll create
172
- # the row so we don't blow up. This means any null rows in between
173
- # will be null instead of [null, null, ...]
174
- sheet.rows[row_idx] ||= Array.new(sheet_width)
175
-
176
- sheet.rows[row_idx][col_idx] = cell
177
- end
178
-
179
- sheet
180
- end
181
-
182
- ##
183
- # Returns the last column name, ex. 'E'
184
- #
185
- # Note that excel writes a '/worksheet/dimension' node we can get the
186
- # last cell from, but some libs (ex. simple_xlsx_writer) don't record
187
- # this. In that case, we assume the data is of uniform column length
188
- # and check the column name of the last header row. Obviously this isn't
189
- # the most robust strategy, but it likely fits 99% of use cases
190
- # considering it's not a problem with actual excel docs.
191
- def last_cell_label(xsheet)
192
- dimension = xsheet.at_xpath('/worksheet/dimension')
193
- if dimension
194
- col = dimension.attributes['ref'].value.match(/:([A-Z]+[0-9]+)/)
195
- col ? col.captures.first : 'A1'
196
- else
197
- last = xsheet.at_xpath("/worksheet/sheetData/row[last()]/c[last()]")
198
- last ? last.attributes['r'].value.match(/([A-Z]+[0-9]+)/).captures.first : 'A1'
199
- end
200
- end
201
-
202
- # Returns dimensions (1-indexed)
203
- def sheet_dimensions(xsheet)
204
- column, row = *last_cell_label(xsheet).match(/([A-Z]+)([0-9]+)/).captures
205
- [column_letter_to_number(column), row.to_i]
206
- end
207
-
208
- # formula fits an exponential factorial function of the form:
209
- # 'A' = 1
210
- # 'B' = 2
211
- # 'Z' = 26
212
- # 'AA' = 26 * 1 + 1
213
- # 'AZ' = 26 * 1 + 26
214
- # 'BA' = 26 * 2 + 1
215
- # 'ZA' = 26 * 26 + 1
216
- # 'ZZ' = 26 * 26 + 26
217
- # 'AAA' = 26 * 26 * 1 + 26 * 1 + 1
218
- # 'AAZ' = 26 * 26 * 1 + 26 * 1 + 26
219
- # 'ABA' = 26 * 26 * 1 + 26 * 2 + 1
220
- # 'BZA' = 26 * 26 * 2 + 26 * 26 + 1
221
- def column_letter_to_number(column_letter)
222
- pow = column_letter.length - 1
223
- result = 0
224
- column_letter.each_byte do |b|
225
- result += 26**pow * (b - 64)
226
- pow -= 1
227
- end
228
- result
229
- end
230
-
231
- # Excel doesn't record types for some cells, only its display style, so
232
- # we have to back out the type from that style.
233
- #
234
- # Some of these styles can be determined from a known set (see NumFmtMap),
235
- # while others are 'custom' and we have to make a best guess.
236
- #
237
- # This is the array of types corresponding to the styles a spreadsheet
238
- # uses, and includes both the known style types and the custom styles.
239
- #
240
- # Note that the xml sheet cells that use this don't reference the
241
- # numFmtId, but instead the array index of a style in the stored list of
242
- # only the styles used in the spreadsheet (which can be either known or
243
- # custom). Hence this style types array, rather than a map of numFmtId to
244
- # type.
245
- def style_types
246
- @style_types ||=
247
- xml.styles.xpath('/styleSheet/cellXfs/xf').map {|xstyle|
248
- style_type_by_num_fmt_id(num_fmt_id(xstyle))}
249
- end
250
-
251
- #returns the numFmtId value if it's available
252
- def num_fmt_id(xstyle)
253
- if xstyle.attributes['numFmtId']
254
- xstyle.attributes['numFmtId'].value
255
- else
256
- nil
257
- end
258
- end
259
-
260
- # Finds the type we think a style is; For example, fmtId 14 is a date
261
- # style, so this would return :date.
262
- #
263
- # Note, custom styles usually (are supposed to?) have a numFmtId >= 164,
264
- # but in practice can sometimes be simply out of the usual "Any Language"
265
- # id range that goes up to 49. For example, I have seen a numFmtId of
266
- # 59 specified as a date. In Thai, 59 is a number format, so this seems
267
- # like a bad idea, but we try to be flexible and just go with it.
268
- def style_type_by_num_fmt_id(id)
269
- return nil if id.nil?
270
-
271
- id = id.to_i
272
- NumFmtMap[id] || custom_style_types[id]
273
- end
274
-
275
- # Map of (numFmtId >= 164) (custom styles) to our best guess at the type
276
- # ex. {164 => :date_time}
277
- def custom_style_types
278
- @custom_style_types ||=
279
- xml.styles.xpath('/styleSheet/numFmts/numFmt').
280
- inject({}) do |acc, xstyle|
281
-
282
- acc[xstyle.attributes['numFmtId'].value.to_i] =
283
- determine_custom_style_type(xstyle.attributes['formatCode'].value)
284
-
285
- acc
286
- end
287
- end
288
-
289
- # This is the least deterministic part of reading xlsx files. Due to
290
- # custom styles, you can't know for sure when a date is a date other than
291
- # looking at its format and guessing. It's not impossible to guess right,
292
- # though.
293
- #
294
- # http://stackoverflow.com/questions/4948998/determining-if-an-xlsx-cell-is-date-formatted-for-excel-2007-spreadsheets
295
- def determine_custom_style_type(string)
296
- return :float if string[0] == '_'
297
- return :float if string[0] == ' 0'
298
-
299
- # Looks for one of ymdhis outside of meta-stuff like [Red]
300
- return :date_time if string =~ /(^|\])[^\[]*[ymdhis]/i
301
-
302
- return :unsupported
303
- end
304
-
305
- ##
306
- # The heart of typecasting. The ruby type is determined either explicitly
307
- # from the cell xml or implicitly from the cell style, and this
308
- # method expects that work to have been done already. This, then,
309
- # takes the type we determined it to be and casts the cell value
310
- # to that type.
311
- #
312
- # types:
313
- # - s: shared string (see #shared_string)
314
- # - n: number (cast to a float)
315
- # - b: boolean
316
- # - str: string
317
- # - inlineStr: string
318
- # - ruby symbol: for when type has been determined by style
319
- #
320
- # options:
321
- # - shared_strings: needed for 's' (shared string) type
322
- def self.cast(value, type, style, options = {})
323
- return nil if value.nil? || value.empty?
324
-
325
- # Sometimes the type is dictated by the style alone
326
- if type.nil? ||
327
- (type == 'n' && [:date, :time, :date_time].include?(style))
328
- type = style
329
- end
330
-
331
- case type
332
-
333
- ##
334
- # There are few built-in types
335
- ##
336
-
337
- when 's' # shared string
338
- options[:shared_strings][value.to_i]
339
- when 'n' # number
340
- value.to_f
341
- when 'b'
342
- value.to_i == 1
343
- when 'str'
344
- value
345
- when 'inlineStr'
346
- value
347
-
348
- ##
349
- # Type can also be determined by a style,
350
- # detected earlier and cast here by its standardized symbol
351
- ##
352
-
353
- when :string, :unsupported
354
- value
355
- when :fixnum
356
- value.to_i
357
- when :float
358
- value.to_f
359
- when :percentage
360
- value.to_f / 100
361
- # the trickiest. note that all these formats can vary on
362
- # whether they actually contain a date, time, or datetime.
363
- when :date, :time, :date_time
364
- value = value.to_f
365
- days_since_date_system_start = value.to_i
366
- fraction_of_24 = value - days_since_date_system_start
367
-
368
- # http://stackoverflow.com/questions/10559767/how-to-convert-ms-excel-date-from-float-to-date-format-in-ruby
369
- date = options.fetch(:base_date, DATE_SYSTEM_1900) + days_since_date_system_start
370
-
371
- if fraction_of_24 > 0 # there is a time associated
372
- seconds = (fraction_of_24 * 86400).round
373
- return Time.utc(date.year, date.month, date.day) + seconds
374
- else
375
- return date
376
- end
377
- when :bignum
378
- if defined?(BigDecimal)
379
- BigDecimal.new(value)
380
- else
381
- value.to_f
382
- end
383
-
384
- ##
385
- # Beats me
386
- ##
387
-
388
- else
389
- value
390
- end
391
- end
392
-
393
- ## Returns the base_date from which to calculate dates.
394
- # Defaults to 1900 (minus two days due to excel quirk), but use 1904 if
395
- # it's set in the Workbook's workbookPr.
396
- # http://msdn.microsoft.com/en-us/library/ff530155(v=office.12).aspx
397
- def base_date
398
- @base_date ||=
399
- begin
400
- return DATE_SYSTEM_1900 if xml.workbook == nil
401
- xml.workbook.xpath("//workbook/workbookPr[@date1904]").each do |workbookPr|
402
- return DATE_SYSTEM_1904 if workbookPr["date1904"] =~ /true|1/i
403
- end
404
- DATE_SYSTEM_1900
405
- end
406
- end
407
-
408
- # Map of non-custom numFmtId to casting symbol
409
- NumFmtMap = {
410
- 0 => :string, # General
411
- 1 => :fixnum, # 0
412
- 2 => :float, # 0.00
413
- 3 => :fixnum, # #,##0
414
- 4 => :float, # #,##0.00
415
- 5 => :unsupported, # $#,##0_);($#,##0)
416
- 6 => :unsupported, # $#,##0_);[Red]($#,##0)
417
- 7 => :unsupported, # $#,##0.00_);($#,##0.00)
418
- 8 => :unsupported, # $#,##0.00_);[Red]($#,##0.00)
419
- 9 => :percentage, # 0%
420
- 10 => :percentage, # 0.00%
421
- 11 => :bignum, # 0.00E+00
422
- 12 => :unsupported, # # ?/?
423
- 13 => :unsupported, # # ??/??
424
- 14 => :date, # mm-dd-yy
425
- 15 => :date, # d-mmm-yy
426
- 16 => :date, # d-mmm
427
- 17 => :date, # mmm-yy
428
- 18 => :time, # h:mm AM/PM
429
- 19 => :time, # h:mm:ss AM/PM
430
- 20 => :time, # h:mm
431
- 21 => :time, # h:mm:ss
432
- 22 => :date_time, # m/d/yy h:mm
433
- 37 => :unsupported, # #,##0 ;(#,##0)
434
- 38 => :unsupported, # #,##0 ;[Red](#,##0)
435
- 39 => :unsupported, # #,##0.00;(#,##0.00)
436
- 40 => :unsupported, # #,##0.00;[Red](#,##0.00)
437
- 45 => :time, # mm:ss
438
- 46 => :time, # [h]:mm:ss
439
- 47 => :time, # mmss.0
440
- 48 => :bignum, # ##0.0E+0
441
- 49 => :unsupported # @
442
- }
443
-
444
- # For performance reasons, excel uses an optional SpreadsheetML feature
445
- # that puts all strings in a separate xml file, and then references
446
- # them by their index in that file.
447
- #
448
- # http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
449
- def shared_strings
450
- @shared_strings ||= begin
451
- if xml.shared_strings
452
- xml.shared_strings.xpath('/sst/si').map do |xsst|
453
- # a shared string can be a single value...
454
- sst = xsst.at_xpath('t/text()')
455
- sst = sst.text if sst
456
- # ... or a composite of separately styled words/characters
457
- sst ||= xsst.xpath('r/t/text()').map(&:text).join
458
- end
459
- else
460
- []
461
- end
462
- end
463
- end
464
-
465
- end
466
-
47
+ alias parse open
467
48
  end
468
49
  end
@@ -7,19 +7,21 @@ Gem::Specification.new do |gem|
7
7
  gem.name = "simple_xlsx_reader"
8
8
  gem.version = SimpleXlsxReader::VERSION
9
9
  gem.authors = ["Woody Peterson"]
10
- gem.email = ["woody@sigby.com"]
10
+ gem.email = ["woody.peterson@gmail.com"]
11
11
  gem.description = %q{Read xlsx data the Ruby way}
12
12
  gem.summary = %q{Read xlsx data the Ruby way}
13
13
  gem.homepage = ""
14
+ gem.license = "MIT"
14
15
 
15
16
  gem.add_dependency 'nokogiri'
16
17
  gem.add_dependency 'rubyzip'
17
18
 
18
19
  gem.add_development_dependency 'minitest', '>= 5.0'
20
+ gem.add_development_dependency 'rake'
19
21
  gem.add_development_dependency 'pry'
20
22
 
21
23
  gem.files = `git ls-files`.split($/)
22
24
  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
23
- gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
25
+ gem.test_files = gem.files.grep(%r{^test/})
24
26
  gem.require_paths = ["lib"]
25
27
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require_relative 'test_helper'
2
4
 
3
5
  describe SimpleXlsxReader do
@@ -5,9 +7,8 @@ describe SimpleXlsxReader do
5
7
  let(:subject) { SimpleXlsxReader::Document.new(date1904_file) }
6
8
 
7
9
  it 'supports converting dates with the 1904 date system' do
8
- subject.to_hash.must_equal({
9
- "date1904" => [[Date.parse("2014-05-01")]]
10
- })
10
+ _(subject.to_hash).must_equal(
11
+ 'date1904' => [[Date.parse('2014-05-01')]]
12
+ )
11
13
  end
12
-
13
14
  end
@@ -1,19 +1,26 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require_relative 'test_helper'
2
4
 
3
5
  describe SimpleXlsxReader do
4
- let(:datetimes_file) { File.join(File.dirname(__FILE__),
5
- 'datetimes.xlsx') }
6
+ let(:datetimes_file) do
7
+ File.join(
8
+ File.dirname(__FILE__),
9
+ 'datetimes.xlsx'
10
+ )
11
+ end
6
12
 
7
13
  let(:subject) { SimpleXlsxReader::Document.new(datetimes_file) }
8
14
 
9
15
  it 'converts date_times with the correct precision' do
10
- subject.to_hash.must_equal({
11
- "Datetimes" =>
12
- [[Time.parse("2013-08-19 18:29:59 UTC")],
13
- [Time.parse("2013-08-19 18:30:00 UTC")],
14
- [Time.parse("2013-08-19 18:30:01 UTC")],
15
- [Time.parse("1899-12-30 00:30:00 UTC")]]
16
- })
16
+ _(subject.to_hash).must_equal(
17
+ 'Datetimes' =>
18
+ [
19
+ [Time.parse('2013-08-19 18:29:59 UTC')],
20
+ [Time.parse('2013-08-19 18:30:00 UTC')],
21
+ [Time.parse('2013-08-19 18:30:01 UTC')],
22
+ [Time.parse('1899-12-30 00:30:00 UTC')]
23
+ ]
24
+ )
17
25
  end
18
-
19
26
  end
Binary file
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'test_helper'
4
+ require 'time'
5
+
6
+ describe SimpleXlsxReader do
7
+ let(:one_sheet_file) { File.join(File.dirname(__FILE__), 'gdocs_sheet.xlsx') }
8
+ let(:subject) { SimpleXlsxReader::Document.new(one_sheet_file) }
9
+
10
+ it 'able to load file from google docs' do
11
+ _(subject.to_hash).must_equal(
12
+ 'List 1' => [['Empty gdocs list 1']],
13
+ 'List 2' => [['Empty gdocs list 2']]
14
+ )
15
+ end
16
+ end
@@ -1,15 +1,20 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require_relative 'test_helper'
2
4
 
3
5
  describe SimpleXlsxReader do
4
- let(:lower_case_shared_strings) { File.join(File.dirname(__FILE__),
5
- 'lower_case_sharedstrings.xlsx') }
6
+ let(:lower_case_shared_strings) do
7
+ File.join(
8
+ File.dirname(__FILE__),
9
+ 'lower_case_sharedstrings.xlsx'
10
+ )
11
+ end
6
12
 
7
13
  let(:subject) { SimpleXlsxReader::Document.new(lower_case_shared_strings) }
8
14
 
9
-
10
15
  describe '#to_hash' do
11
16
  it 'should have the word Well in the first row' do
12
- subject.sheets.first.rows[0].must_include('Well')
17
+ _(subject.sheets.first.rows.to_a[0]).must_include('Well')
13
18
  end
14
19
  end
15
20
  end