RubyGems - simple_xlsx_reader - Versions diffs - 1.0.2 → 2.0.0 - Mend

simple_xlsx_reader 1.0.2 → 2.0.0

Files changed (28)

checksums.yaml +5 -5
data/.github/workflows/ruby.yml +38 -0
data/.travis.yml +8 -0
data/CHANGELOG.md +22 -0
data/README.md +190 -57
data/Rakefile +3 -1
data/lib/simple_xlsx_reader/document.rb +147 -0
data/lib/simple_xlsx_reader/hyperlink.rb +30 -0
data/lib/simple_xlsx_reader/loader/shared_strings_parser.rb +46 -0
data/lib/simple_xlsx_reader/loader/sheet_parser.rb +256 -0
data/lib/simple_xlsx_reader/loader/style_types_parser.rb +115 -0
data/lib/simple_xlsx_reader/loader/workbook_parser.rb +39 -0
data/lib/simple_xlsx_reader/loader.rb +199 -0
data/lib/simple_xlsx_reader/version.rb +3 -1
data/lib/simple_xlsx_reader.rb +23 -442
data/simple_xlsx_reader.gemspec +4 -2
data/test/date1904_test.rb +5 -4
data/test/datetime_test.rb +17 -10
data/test/gdocs_sheet.xlsx +0 -0
data/test/gdocs_sheet_test.rb +16 -0
data/test/lower_case_sharedstrings_test.rb +9 -4
data/test/performance_test.rb +86 -89
data/test/sesame_street_blog.xlsx +0 -0
data/test/shared_strings.xml +4 -0
data/test/simple_xlsx_reader_test.rb +835 -320
data/test/test_helper.rb +4 -1
data/test/test_xlsx_builder.rb +104 -0
metadata +38 -9

data/lib/simple_xlsx_reader.rb CHANGED Viewed

@@ -1,7 +1,18 @@
-require "simple_xlsx_reader/version"
+# frozen_string_literal: true
 require 'nokogiri'
 require 'date'
+require 'simple_xlsx_reader/version'
+require 'simple_xlsx_reader/hyperlink'
+require 'simple_xlsx_reader/document'
+require 'simple_xlsx_reader/loader'
+require 'simple_xlsx_reader/loader/workbook_parser'
+require 'simple_xlsx_reader/loader/shared_strings_parser'
+require 'simple_xlsx_reader/loader/sheet_parser'
+require 'simple_xlsx_reader/loader/style_types_parser'
 # Rubyzip 1.0 only has different naming, everything else is the same, so let's
 # be flexible so we don't force people into a dependency hell w/ other gems.
 begin
@@ -17,452 +28,22 @@ rescue LoadError
 end
 module SimpleXlsxReader
-  class CellLoadError < StandardError; end
-  def self.configuration
-    @configuration ||= Struct.new(:catch_cell_load_errors).new.tap do |c|
-      c.catch_cell_load_errors = false
-    end
-  end
-  def self.open(file_path)
-    Document.new(file_path).tap(&:sheets)
-  end
-  class Document
-    attr_reader :file_path
-    def initialize(file_path)
-      @file_path = file_path
-    end
-    def sheets
-      @sheets ||= Mapper.new(xml).load_sheets
-    end
+  DATE_SYSTEM_1900 = Date.new(1899, 12, 30)
+  DATE_SYSTEM_1904 = Date.new(1904, 1, 1)
-    def to_hash
-      sheets.inject({}) {|acc, sheet| acc[sheet.name] = sheet.rows; acc}
-    end
-    def xml
-      Xml.load(file_path)
-    end
-    class Sheet < Struct.new(:name, :rows)
-      def headers
-        rows[0]
-      end
-      def data
-        rows[1..-1]
-      end
+  class CellLoadError < StandardError; end
-      # Load errors will be a hash of the form:
-      # {
-      #   [rownum, colnum] => '[error]'
-      # }
-      def load_errors
-        @load_errors ||= {}
+  class << self
+    def configuration
+      @configuration ||= Struct.new(:catch_cell_load_errors, :auto_slurp).new.tap do |c|
+        c.catch_cell_load_errors = false
+        c.auto_slurp = false
       end
     end
-    ##
-    # For internal use; stores source xml in nokogiri documents
-    class Xml
-      attr_accessor :workbook, :shared_strings, :sheets, :styles
-      def self.load(file_path)
-        self.new.tap do |xml|
-          SimpleXlsxReader::Zip.open(file_path) do |zip|
-            xml.workbook = Nokogiri::XML(zip.read('xl/workbook.xml')).remove_namespaces!
-            xml.styles   = Nokogiri::XML(zip.read('xl/styles.xml')).remove_namespaces!
-            # optional feature used by excel, but not often used by xlsx
-            # generation libraries
-            ss_file =  (zip.to_a.map(&:name) & ['xl/sharedStrings.xml','xl/sharedstrings.xml'])[0]
-            if ss_file
-              xml.shared_strings = Nokogiri::XML(zip.read(ss_file)).remove_namespaces!
-            end
-            xml.sheets = []
-            i = 0
-            loop do
-              i += 1
-              break if !zip.file.file?("xl/worksheets/sheet#{i}.xml")
-              xml.sheets <<
-                Nokogiri::XML(zip.read("xl/worksheets/sheet#{i}.xml")).remove_namespaces!
-            end
-          end
-        end
-      end
+    def open(file_path)
+      Document.new(file_path).tap(&:sheets)
     end
-    ##
-    # For internal use; translates source xml to Sheet objects.
-    class Mapper < Struct.new(:xml)
-      DATE_SYSTEM_1900 = Date.new(1899, 12, 30)
-      DATE_SYSTEM_1904 = Date.new(1904, 1, 1)
-      def load_sheets
-        sheet_toc.each_with_index.map do |(sheet_name, _sheet_number), i|
-          parse_sheet(sheet_name, xml.sheets[i])  # sheet_number is *not* the index into xml.sheets
-        end
-      end
-      # Table of contents for the sheets, ex. {'Authors' => 0, ...}
-      def sheet_toc
-        xml.workbook.xpath('/workbook/sheets/sheet').
-          inject({}) do |acc, sheet|
-          acc[sheet.attributes['name'].value] =
-            sheet.attributes['sheetId'].value.to_i - 1 # keep things 0-indexed
-          acc
-        end
-      end
-      def parse_sheet(sheet_name, xsheet)
-        sheet = Sheet.new(sheet_name)
-        sheet_width, sheet_height = *sheet_dimensions(xsheet)
-        sheet.rows = Array.new(sheet_height) { Array.new(sheet_width) }
-        xsheet.xpath("/worksheet/sheetData/row/c").each do |xcell|
-          column, row = *xcell.attr('r').match(/([A-Z]+)([0-9]+)/).captures
-          col_idx = column_letter_to_number(column) - 1
-          row_idx = row.to_i - 1
-          type  = xcell.attributes['t'] &&
-                  xcell.attributes['t'].value
-          style = xcell.attributes['s'] &&
-                  style_types[xcell.attributes['s'].value.to_i]
-          # This is the main performance bottleneck. Using just 'xcell.text'
-          # would be ideal, and makes parsing super-fast. However, there's
-          # other junk in the cell, formula references in particular,
-          # so we really do have to look for specific value nodes.
-          # Maybe there is a really clever way to use xcell.text and parse out
-          # the correct value, but I can't think of one, or an alternative
-          # strategy.
-          #
-          # And yes, this really is faster than using xcell.at_xpath(...),
-          # by about 60%. Odd.
-          xvalue = type == 'inlineStr' ?
-            (xis = xcell.children.find {|c| c.name == 'is'}) && xis.children.find {|c| c.name == 't'} :
-            xcell.children.find {|c| c.name == 'v'}
-          cell = begin
-            self.class.cast(xvalue && xvalue.text.strip, type, style,
-                            :shared_strings => shared_strings,
-                            :base_date => base_date)
-          rescue => e
-            if !SimpleXlsxReader.configuration.catch_cell_load_errors
-              error = CellLoadError.new(
-                "Row #{row_idx}, Col #{col_idx}: #{e.message}")
-              error.set_backtrace(e.backtrace)
-              raise error
-            else
-              sheet.load_errors[[row_idx, col_idx]] = e.message
-              xcell.text.strip
-            end
-          end
-          # This shouldn't be necessary, but just in case, we'll create
-          # the row so we don't blow up. This means any null rows in between
-          # will be null instead of [null, null, ...]
-          sheet.rows[row_idx] ||= Array.new(sheet_width)
-          sheet.rows[row_idx][col_idx] = cell
-        end
-        sheet
-      end
-      ##
-      # Returns the last column name, ex. 'E'
-      #
-      # Note that excel writes a '/worksheet/dimension' node we can get the
-      # last cell from, but some libs (ex. simple_xlsx_writer) don't record
-      # this. In that case, we assume the data is of uniform column length
-      # and check the column name of the last header row. Obviously this isn't
-      # the most robust strategy, but it likely fits 99% of use cases
-      # considering it's not a problem with actual excel docs.
-      def last_cell_label(xsheet)
-        dimension = xsheet.at_xpath('/worksheet/dimension')
-        if dimension
-          col = dimension.attributes['ref'].value.match(/:([A-Z]+[0-9]+)/)
-          col ? col.captures.first : 'A1'
-        else
-          last = xsheet.at_xpath("/worksheet/sheetData/row[last()]/c[last()]")
-          last ? last.attributes['r'].value.match(/([A-Z]+[0-9]+)/).captures.first : 'A1'
-        end
-      end
-      # Returns dimensions (1-indexed)
-      def sheet_dimensions(xsheet)
-        column, row = *last_cell_label(xsheet).match(/([A-Z]+)([0-9]+)/).captures
-        [column_letter_to_number(column), row.to_i]
-      end
-      # formula fits an exponential factorial function of the form:
-      # 'A'   = 1
-      # 'B'   = 2
-      # 'Z'   = 26
-      # 'AA'  = 26 * 1  + 1
-      # 'AZ'  = 26 * 1  + 26
-      # 'BA'  = 26 * 2  + 1
-      # 'ZA'  = 26 * 26 + 1
-      # 'ZZ'  = 26 * 26 + 26
-      # 'AAA' = 26 * 26 * 1 + 26 * 1  + 1
-      # 'AAZ' = 26 * 26 * 1 + 26 * 1  + 26
-      # 'ABA' = 26 * 26 * 1 + 26 * 2  + 1
-      # 'BZA' = 26 * 26 * 2 + 26 * 26 + 1
-      def column_letter_to_number(column_letter)
-        pow = column_letter.length - 1
-        result = 0
-        column_letter.each_byte do |b|
-          result += 26**pow * (b - 64)
-          pow -= 1
-        end
-        result
-      end
-      # Excel doesn't record types for some cells, only its display style, so
-      # we have to back out the type from that style.
-      #
-      # Some of these styles can be determined from a known set (see NumFmtMap),
-      # while others are 'custom' and we have to make a best guess.
-      #
-      # This is the array of types corresponding to the styles a spreadsheet
-      # uses, and includes both the known style types and the custom styles.
-      #
-      # Note that the xml sheet cells that use this don't reference the
-      # numFmtId, but instead the array index of a style in the stored list of
-      # only the styles used in the spreadsheet (which can be either known or
-      # custom). Hence this style types array, rather than a map of numFmtId to
-      # type.
-      def style_types
-        @style_types ||=
-            xml.styles.xpath('/styleSheet/cellXfs/xf').map {|xstyle|
-              style_type_by_num_fmt_id(num_fmt_id(xstyle))}
-      end
-      #returns the numFmtId value if it's available
-      def num_fmt_id(xstyle)
-        if xstyle.attributes['numFmtId']
-          xstyle.attributes['numFmtId'].value
-        else
-          nil
-        end
-      end
-      # Finds the type we think a style is; For example, fmtId 14 is a date
-      # style, so this would return :date.
-      #
-      # Note, custom styles usually (are supposed to?) have a numFmtId >= 164,
-      # but in practice can sometimes be simply out of the usual "Any Language"
-      # id range that goes up to 49. For example, I have seen a numFmtId of
-      # 59 specified as a date. In Thai, 59 is a number format, so this seems
-      # like a bad idea, but we try to be flexible and just go with it.
-      def style_type_by_num_fmt_id(id)
-        return nil if id.nil?
-        id = id.to_i
-        NumFmtMap[id] || custom_style_types[id]
-      end
-      # Map of (numFmtId >= 164) (custom styles) to our best guess at the type
-      # ex. {164 => :date_time}
-      def custom_style_types
-        @custom_style_types ||=
-          xml.styles.xpath('/styleSheet/numFmts/numFmt').
-          inject({}) do |acc, xstyle|
-          acc[xstyle.attributes['numFmtId'].value.to_i] =
-            determine_custom_style_type(xstyle.attributes['formatCode'].value)
-          acc
-        end
-      end
-      # This is the least deterministic part of reading xlsx files. Due to
-      # custom styles, you can't know for sure when a date is a date other than
-      # looking at its format and guessing. It's not impossible to guess right,
-      # though.
-      #
-      # http://stackoverflow.com/questions/4948998/determining-if-an-xlsx-cell-is-date-formatted-for-excel-2007-spreadsheets
-      def determine_custom_style_type(string)
-        return :float if string[0] == '_'
-        return :float if string[0] == ' 0'
-        # Looks for one of ymdhis outside of meta-stuff like [Red]
-        return :date_time if string =~ /(^|\])[^\[]*[ymdhis]/i
-        return :unsupported
-      end
-      ##
-      # The heart of typecasting. The ruby type is determined either explicitly
-      # from the cell xml or implicitly from the cell style, and this
-      # method expects that work to have been done already. This, then,
-      # takes the type we determined it to be and casts the cell value
-      # to that type.
-      #
-      # types:
-      # - s: shared string (see #shared_string)
-      # - n: number (cast to a float)
-      # - b: boolean
-      # - str: string
-      # - inlineStr: string
-      # - ruby symbol: for when type has been determined by style
-      #
-      # options:
-      # - shared_strings: needed for 's' (shared string) type
-      def self.cast(value, type, style, options = {})
-        return nil if value.nil? || value.empty?
-        # Sometimes the type is dictated by the style alone
-        if type.nil? ||
-          (type == 'n' && [:date, :time, :date_time].include?(style))
-          type = style
-        end
-        case type
-        ##
-        # There are few built-in types
-        ##
-        when 's' # shared string
-          options[:shared_strings][value.to_i]
-        when 'n' # number
-          value.to_f
-        when 'b'
-          value.to_i == 1
-        when 'str'
-          value
-        when 'inlineStr'
-          value
-        ##
-        # Type can also be determined by a style,
-        # detected earlier and cast here by its standardized symbol
-        ##
-        when :string, :unsupported
-          value
-        when :fixnum
-          value.to_i
-        when :float
-          value.to_f
-        when :percentage
-          value.to_f / 100
-        # the trickiest. note that  all these formats can vary on
-        # whether they actually contain a date, time, or datetime.
-        when :date, :time, :date_time
-          value = value.to_f
-          days_since_date_system_start = value.to_i
-          fraction_of_24 = value - days_since_date_system_start
-          # http://stackoverflow.com/questions/10559767/how-to-convert-ms-excel-date-from-float-to-date-format-in-ruby
-          date = options.fetch(:base_date, DATE_SYSTEM_1900) + days_since_date_system_start
-          if fraction_of_24 > 0 # there is a time associated
-            seconds = (fraction_of_24 * 86400).round
-            return Time.utc(date.year, date.month, date.day) + seconds
-          else
-            return date
-          end
-        when :bignum
-          if defined?(BigDecimal)
-            BigDecimal.new(value)
-          else
-            value.to_f
-          end
-        ##
-        # Beats me
-        ##
-        else
-          value
-        end
-      end
-      ## Returns the base_date from which to calculate dates.
-      # Defaults to 1900 (minus two days due to excel quirk), but use 1904 if
-      # it's set in the Workbook's workbookPr.
-      # http://msdn.microsoft.com/en-us/library/ff530155(v=office.12).aspx
-      def base_date
-        @base_date ||=
-          begin
-            return DATE_SYSTEM_1900 if xml.workbook == nil
-            xml.workbook.xpath("//workbook/workbookPr[@date1904]").each do |workbookPr|
-              return DATE_SYSTEM_1904 if workbookPr["date1904"] =~ /true|1/i
-            end
-            DATE_SYSTEM_1900
-          end
-      end
-      # Map of non-custom numFmtId to casting symbol
-      NumFmtMap = {
-        0  => :string,         # General
-        1  => :fixnum,         # 0
-        2  => :float,          # 0.00
-        3  => :fixnum,         # #,##0
-        4  => :float,          # #,##0.00
-        5  => :unsupported,    # $#,##0_);($#,##0)
-        6  => :unsupported,    # $#,##0_);[Red]($#,##0)
-        7  => :unsupported,    # $#,##0.00_);($#,##0.00)
-        8  => :unsupported,    # $#,##0.00_);[Red]($#,##0.00)
-        9  => :percentage,     # 0%
-        10 => :percentage,     # 0.00%
-        11 => :bignum,         # 0.00E+00
-        12 => :unsupported,    # # ?/?
-        13 => :unsupported,    # # ??/??
-        14 => :date,           # mm-dd-yy
-        15 => :date,           # d-mmm-yy
-        16 => :date,           # d-mmm
-        17 => :date,           # mmm-yy
-        18 => :time,           # h:mm AM/PM
-        19 => :time,           # h:mm:ss AM/PM
-        20 => :time,           # h:mm
-        21 => :time,           # h:mm:ss
-        22 => :date_time,      # m/d/yy h:mm
-        37 => :unsupported,    # #,##0 ;(#,##0)
-        38 => :unsupported,    # #,##0 ;[Red](#,##0)
-        39 => :unsupported,    # #,##0.00;(#,##0.00)
-        40 => :unsupported,    # #,##0.00;[Red](#,##0.00)
-        45 => :time,           # mm:ss
-        46 => :time,           # [h]:mm:ss
-        47 => :time,           # mmss.0
-        48 => :bignum,         # ##0.0E+0
-        49 => :unsupported     # @
-      }
-      # For performance reasons, excel uses an optional SpreadsheetML feature
-      # that puts all strings in a separate xml file, and then references
-      # them by their index in that file.
-      #
-      # http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
-      def shared_strings
-        @shared_strings ||= begin
-          if xml.shared_strings
-            xml.shared_strings.xpath('/sst/si').map do |xsst|
-              # a shared string can be a single value...
-              sst = xsst.at_xpath('t/text()')
-              sst = sst.text if sst
-              # ... or a composite of separately styled words/characters
-              sst ||= xsst.xpath('r/t/text()').map(&:text).join
-            end
-          else
-            []
-          end
-        end
-      end
-    end
+    alias parse open
   end
 end

data/simple_xlsx_reader.gemspec CHANGED Viewed

@@ -7,19 +7,21 @@ Gem::Specification.new do |gem|
   gem.name          = "simple_xlsx_reader"
   gem.version       = SimpleXlsxReader::VERSION
   gem.authors       = ["Woody Peterson"]
-  gem.email         = ["woody@sigby.com"]
+  gem.email         = ["woody.peterson@gmail.com"]
   gem.description   = %q{Read xlsx data the Ruby way}
   gem.summary       = %q{Read xlsx data the Ruby way}
   gem.homepage      = ""
+  gem.license       = "MIT"
   gem.add_dependency 'nokogiri'
   gem.add_dependency 'rubyzip'
   gem.add_development_dependency 'minitest', '>= 5.0'
+  gem.add_development_dependency 'rake'
   gem.add_development_dependency 'pry'
   gem.files         = `git ls-files`.split($/)
   gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
-  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
+  gem.test_files    = gem.files.grep(%r{^test/})
   gem.require_paths = ["lib"]
 end

data/test/date1904_test.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 require_relative 'test_helper'
 describe SimpleXlsxReader do
@@ -5,9 +7,8 @@ describe SimpleXlsxReader do
   let(:subject) { SimpleXlsxReader::Document.new(date1904_file) }
   it 'supports converting dates with the 1904 date system' do
-    subject.to_hash.must_equal({
-      "date1904" => [[Date.parse("2014-05-01")]]
-    })
+    _(subject.to_hash).must_equal(
+      'date1904' => [[Date.parse('2014-05-01')]]
+    )
   end
 end

data/test/datetime_test.rb CHANGED Viewed

@@ -1,19 +1,26 @@
+# frozen_string_literal: true
 require_relative 'test_helper'
 describe SimpleXlsxReader do
-  let(:datetimes_file) { File.join(File.dirname(__FILE__),
-                                   'datetimes.xlsx') }
+  let(:datetimes_file) do
+    File.join(
+      File.dirname(__FILE__),
+      'datetimes.xlsx'
+    )
+  end
   let(:subject) { SimpleXlsxReader::Document.new(datetimes_file) }
   it 'converts date_times with the correct precision' do
-    subject.to_hash.must_equal({
-      "Datetimes" =>
-        [[Time.parse("2013-08-19 18:29:59 UTC")],
-         [Time.parse("2013-08-19 18:30:00 UTC")],
-         [Time.parse("2013-08-19 18:30:01 UTC")],
-         [Time.parse("1899-12-30 00:30:00 UTC")]]
-    })
+    _(subject.to_hash).must_equal(
+      'Datetimes' =>
+        [
+          [Time.parse('2013-08-19 18:29:59 UTC')],
+          [Time.parse('2013-08-19 18:30:00 UTC')],
+          [Time.parse('2013-08-19 18:30:01 UTC')],
+          [Time.parse('1899-12-30 00:30:00 UTC')]
+        ]
+    )
   end
 end

data/test/gdocs_sheet.xlsx ADDED Viewed

Binary file

data/test/gdocs_sheet_test.rb ADDED Viewed

@@ -0,0 +1,16 @@
+# frozen_string_literal: true
+require_relative 'test_helper'
+require 'time'
+describe SimpleXlsxReader do
+  let(:one_sheet_file) { File.join(File.dirname(__FILE__), 'gdocs_sheet.xlsx') }
+  let(:subject) { SimpleXlsxReader::Document.new(one_sheet_file) }
+  it 'able to load file from google docs' do
+    _(subject.to_hash).must_equal(
+      'List 1' => [['Empty gdocs list 1']],
+      'List 2' => [['Empty gdocs list 2']]
+    )
+  end
+end

data/test/lower_case_sharedstrings_test.rb CHANGED Viewed

@@ -1,15 +1,20 @@
+# frozen_string_literal: true
 require_relative 'test_helper'
 describe SimpleXlsxReader do
-  let(:lower_case_shared_strings) { File.join(File.dirname(__FILE__),
-                                                'lower_case_sharedstrings.xlsx') }
+  let(:lower_case_shared_strings) do
+    File.join(
+      File.dirname(__FILE__),
+      'lower_case_sharedstrings.xlsx'
+    )
+  end
   let(:subject) { SimpleXlsxReader::Document.new(lower_case_shared_strings) }
   describe '#to_hash' do
     it 'should have the word Well in the first row' do
-      subject.sheets.first.rows[0].must_include('Well')
+      _(subject.sheets.first.rows.to_a[0]).must_include('Well')
     end
   end
 end