RubyGems - simple_xlsx_reader - Versions diffs - 1.0.5 → 2.0.0 - Mend

simple_xlsx_reader 1.0.5 → 2.0.0

Files changed (24) hide show

checksums.yaml +4 -4
data/.github/workflows/ruby.yml +38 -0
data/CHANGELOG.md +7 -0
data/README.md +190 -64
data/Rakefile +3 -1
data/lib/simple_xlsx_reader/document.rb +147 -0
data/lib/simple_xlsx_reader/hyperlink.rb +30 -0
data/lib/simple_xlsx_reader/loader/shared_strings_parser.rb +46 -0
data/lib/simple_xlsx_reader/loader/sheet_parser.rb +256 -0
data/lib/simple_xlsx_reader/loader/style_types_parser.rb +115 -0
data/lib/simple_xlsx_reader/loader/workbook_parser.rb +39 -0
data/lib/simple_xlsx_reader/loader.rb +199 -0
data/lib/simple_xlsx_reader/version.rb +3 -1
data/lib/simple_xlsx_reader.rb +23 -519
data/test/date1904_test.rb +5 -4
data/test/datetime_test.rb +17 -10
data/test/gdocs_sheet_test.rb +6 -5
data/test/lower_case_sharedstrings_test.rb +9 -4
data/test/performance_test.rb +85 -88
data/test/shared_strings.xml +4 -0
data/test/simple_xlsx_reader_test.rb +785 -375
data/test/test_helper.rb +4 -1
data/test/test_xlsx_builder.rb +104 -0
metadata +16 -6

data/lib/simple_xlsx_reader/loader/sheet_parser.rb ADDED Viewed

@@ -0,0 +1,256 @@
+# frozen_string_literal: true
+require 'forwardable'
+module SimpleXlsxReader
+  class Loader
+    class SheetParser < Nokogiri::XML::SAX::Document
+      extend Forwardable
+      attr_accessor :xrels_file
+      attr_accessor :hyperlinks_by_cell
+      attr_reader :load_errors
+      def_delegators :@loader, :style_types, :shared_strings, :base_date
+      def initialize(file_io:, loader:)
+        @file_io = file_io
+        @loader = loader
+      end
+      def parse(headers: false, &block)
+        raise 'parse called without a block; what should this do?'\
+          unless block_given?
+        @headers = headers
+        @each_callback = block
+        @load_errors = {}
+        @current_row_num = nil
+        @last_seen_row_idx = 0
+        @url = nil # silence warnings
+        @function = nil # silence warnings
+        @capture = nil # silence warnings
+        @dimension = nil # silence warnings
+        @file_io.rewind # in case we've already parsed this once
+        # In this project this is only used for GUI-made hyperlinks (as opposed
+        # to FUNCTION-based hyperlinks). Unfortunately the're needed to parse
+        # the spreadsheet, and they come AFTER the sheet data. So, solution is
+        # to just stream-parse the file twice, first for the hyperlinks at the
+        # bottom of the file, then for the file itself. In the future it would
+        # be clever to use grep to extract the xml into its own smaller file.
+        if xrels_file&.grep(/hyperlink/)&.any?
+          xrels_file.rewind
+          load_gui_hyperlinks # represented as hyperlinks_by_cell
+          @file_io.rewind
+        end
+        Nokogiri::XML::SAX::Parser.new(self).parse(@file_io)
+      end
+      ###
+      # SAX document hooks
+      def start_element(name, attrs = [])
+        case name
+        when 'dimension' then @dimension = attrs.last.last
+        when 'row'
+          @current_row_num = attrs.find {|(k, v)| k == 'r'}&.last&.to_i
+          @current_row = Array.new(column_length)
+        when 'c'
+          attrs = attrs.inject({}) {|acc, (k, v)| acc[k] = v; acc}
+          @cell_name = attrs['r']
+          @type = attrs['t']
+          @style = attrs['s'] && style_types[attrs['s'].to_i]
+        when 'f' then @function = true
+        when 'v', 't' then @capture = true
+        end
+      end
+      def characters(string)
+        if @function
+          # the only "function" we support is a hyperlink
+          @url = string.slice(/HYPERLINK\("(.*?)"/, 1)
+        end
+        return unless @capture
+        @current_row[cell_idx] =
+          begin
+            SimpleXlsxReader::Loader.cast(
+              string.strip, @type, @style,
+              url: @url || hyperlinks_by_cell&.[](@cell_name),
+              shared_strings: shared_strings,
+              base_date: base_date
+            )
+          rescue StandardError => e
+            column, row = @cell_name.match(/([A-Z]+)([0-9]+)/).captures
+            col_idx = column_letter_to_number(column) - 1
+            row_idx = row.to_i - 1
+            if !SimpleXlsxReader.configuration.catch_cell_load_errors
+              error = CellLoadError.new(
+                "Row #{row_idx}, Col #{col_idx}: #{e.message}"
+              )
+              error.set_backtrace(e.backtrace)
+              raise error
+            else
+              @load_errors[[row_idx, col_idx]] = e.message
+              string.strip
+            end
+          end
+      end
+      def end_element(name)
+        case name
+        when 'row'
+          if @headers == true # ya a little funky
+            @headers = @current_row
+          elsif @headers.is_a?(Hash)
+            test_headers_hash_against_current_row
+            # in case there were empty rows before finding the header
+            @last_seen_row_idx = @current_row_num - 1
+          elsif @headers.respond_to?(:call)
+            @headers = @current_row if @headers.call(@current_row)
+            # in case there were empty rows before finding the header
+            @last_seen_row_idx = @current_row_num - 1
+          elsif @headers
+            possibly_yield_empty_rows(headers: true)
+            yield_row(@current_row, headers: true)
+          else
+            possibly_yield_empty_rows(headers: false)
+            yield_row(@current_row, headers: false)
+          end
+          @last_seen_row_idx += 1
+          # Note that excel writes a '/worksheet/dimension' node we can get
+          # this from, but some libs (ex. simple_xlsx_writer) don't record it.
+          # In that case, we assume the data is of uniform column length and
+          # store the column name of the last header row we see. Obviously this
+          # isn't the most robust strategy, but it likely fits 99% of use cases
+          # considering it's not a problem with actual excel docs.
+          @dimension = "A1:#{@cell_name}" if @dimension.nil?
+        when 'v', 't' then @capture = false
+        when 'f' then @function = false
+        when 'c' then @url = nil
+        end
+      end
+      ###
+      # /End SAX hooks
+      def test_headers_hash_against_current_row
+        found = false
+        @current_row.each_with_index do |cell, cell_idx|
+          @headers.each_pair do |key, search|
+            if search.is_a?(String) ? cell == search : cell&.match?(search)
+              found = true
+              @current_row[cell_idx] = key
+            end
+          end
+        end
+        @headers = @current_row if found
+      end
+      def possibly_yield_empty_rows(headers:)
+        while @current_row_num && @current_row_num > @last_seen_row_idx + 1
+          @last_seen_row_idx += 1
+          yield_row(Array.new(column_length), headers: headers)
+        end
+      end
+      def yield_row(row, headers:)
+        if headers
+          @each_callback.call(Hash[@headers.zip(row)])
+        else
+          @each_callback.call(row)
+        end
+      end
+      # This sax-parses the whole sheet, just to extract hyperlink refs at the end.
+      def load_gui_hyperlinks
+        self.hyperlinks_by_cell =
+          HyperlinksParser.parse(@file_io, xrels: xrels)
+      end
+      class HyperlinksParser < Nokogiri::XML::SAX::Document
+        def initialize(file_io, xrels:)
+          @file_io = file_io
+          @xrels = xrels
+        end
+        def self.parse(file_io, xrels:)
+          new(file_io, xrels: xrels).parse
+        end
+        def parse
+          @hyperlinks_by_cell = {}
+          Nokogiri::XML::SAX::Parser.new(self).parse(@file_io)
+          @hyperlinks_by_cell
+        end
+        def start_element(name, attrs)
+          case name
+          when 'hyperlink'
+            attrs = attrs.inject({}) {|acc, (k, v)| acc[k] = v; acc}
+            id = attrs['id'] || attrs['r:id']
+            @hyperlinks_by_cell[attrs['ref']] =
+              @xrels.at_xpath(%(//*[@Id="#{id}"])).attr('Target')
+          end
+        end
+      end
+      def xrels
+        @xrels ||= Nokogiri::XML(xrels_file.read) if xrels_file
+      end
+      def column_length
+        return 0 unless @dimension
+        @column_length ||= column_letter_to_number(last_cell_letter)
+      end
+      def cell_idx
+        column_letter_to_number(@cell_name.scan(/[A-Z]+/).first) - 1
+      end
+      ##
+      # Returns the last column name, ex. 'E'
+      def last_cell_letter
+        return unless @dimension
+        @dimension.scan(/:([A-Z]+)/)&.first&.first || 'A'
+      end
+      # formula fits an exponential factorial function of the form:
+      # 'A'   = 1
+      # 'B'   = 2
+      # 'Z'   = 26
+      # 'AA'  = 26 * 1  + 1
+      # 'AZ'  = 26 * 1  + 26
+      # 'BA'  = 26 * 2  + 1
+      # 'ZA'  = 26 * 26 + 1
+      # 'ZZ'  = 26 * 26 + 26
+      # 'AAA' = 26 * 26 * 1 + 26 * 1  + 1
+      # 'AAZ' = 26 * 26 * 1 + 26 * 1  + 26
+      # 'ABA' = 26 * 26 * 1 + 26 * 2  + 1
+      # 'BZA' = 26 * 26 * 2 + 26 * 26 + 1
+      def column_letter_to_number(column_letter)
+        pow = column_letter.length - 1
+        result = 0
+        column_letter.each_byte do |b|
+          result += 26**pow * (b - 64)
+          pow -= 1
+        end
+        result
+      end
+    end
+  end
+end

data/lib/simple_xlsx_reader/loader/style_types_parser.rb ADDED Viewed

@@ -0,0 +1,115 @@
+# frozen_string_literal: true
+module SimpleXlsxReader
+  class Loader
+    StyleTypesParser = Struct.new(:file_io) do
+      def self.parse(file_io)
+        new(file_io).tap(&:parse).style_types
+      end
+      # Map of non-custom numFmtId to casting symbol
+      NumFmtMap = {
+        0 => :string,         # General
+        1 => :fixnum,         # 0
+        2 => :float,          # 0.00
+        3 => :fixnum,         # #,##0
+        4 => :float,          # #,##0.00
+        5 => :unsupported,    # $#,##0_);($#,##0)
+        6 => :unsupported,    # $#,##0_);[Red]($#,##0)
+        7 => :unsupported,    # $#,##0.00_);($#,##0.00)
+        8 => :unsupported,    # $#,##0.00_);[Red]($#,##0.00)
+        9 => :percentage,     # 0%
+        10 => :percentage,     # 0.00%
+        11 => :bignum,         # 0.00E+00
+        12 => :unsupported,    # # ?/?
+        13 => :unsupported,    # # ??/??
+        14 => :date,           # mm-dd-yy
+        15 => :date,           # d-mmm-yy
+        16 => :date,           # d-mmm
+        17 => :date,           # mmm-yy
+        18 => :time,           # h:mm AM/PM
+        19 => :time,           # h:mm:ss AM/PM
+        20 => :time,           # h:mm
+        21 => :time,           # h:mm:ss
+        22 => :date_time,      # m/d/yy h:mm
+        37 => :unsupported,    # #,##0 ;(#,##0)
+        38 => :unsupported,    # #,##0 ;[Red](#,##0)
+        39 => :unsupported,    # #,##0.00;(#,##0.00)
+        40 => :unsupported,    # #,##0.00;[Red](#,##0.00)
+        45 => :time,           # mm:ss
+        46 => :time,           # [h]:mm:ss
+        47 => :time,           # mmss.0
+        48 => :bignum,         # ##0.0E+0
+        49 => :unsupported     # @
+      }.freeze
+      def parse
+        @xml = Nokogiri::XML(file_io.read).remove_namespaces!
+      end
+      # Excel doesn't record types for some cells, only its display style, so
+      # we have to back out the type from that style.
+      #
+      # Some of these styles can be determined from a known set (see NumFmtMap),
+      # while others are 'custom' and we have to make a best guess.
+      #
+      # This is the array of types corresponding to the styles a spreadsheet
+      # uses, and includes both the known style types and the custom styles.
+      #
+      # Note that the xml sheet cells that use this don't reference the
+      # numFmtId, but instead the array index of a style in the stored list of
+      # only the styles used in the spreadsheet (which can be either known or
+      # custom). Hence this style types array, rather than a map of numFmtId to
+      # type.
+      def style_types
+        @xml.xpath('/styleSheet/cellXfs/xf').map do |xstyle|
+          style_type_by_num_fmt_id(
+            xstyle.attributes['numFmtId']&.value
+          )
+        end
+      end
+      # Finds the type we think a style is; For example, fmtId 14 is a date
+      # style, so this would return :date.
+      #
+      # Note, custom styles usually (are supposed to?) have a numFmtId >= 164,
+      # but in practice can sometimes be simply out of the usual "Any Language"
+      # id range that goes up to 49. For example, I have seen a numFmtId of
+      # 59 specified as a date. In Thai, 59 is a number format, so this seems
+      # like a bad idea, but we try to be flexible and just go with it.
+      def style_type_by_num_fmt_id(id)
+        return nil if id.nil?
+        id = id.to_i
+        NumFmtMap[id] || custom_style_types[id]
+      end
+      # Map of (numFmtId >= 164) (custom styles) to our best guess at the type
+      # ex. {164 => :date_time}
+      def custom_style_types
+        @custom_style_types ||=
+          @xml.xpath('/styleSheet/numFmts/numFmt')
+            .each_with_object({}) do |xstyle, acc|
+              acc[xstyle.attributes['numFmtId'].value.to_i] =
+                determine_custom_style_type(xstyle.attributes['formatCode'].value)
+            end
+      end
+      # This is the least deterministic part of reading xlsx files. Due to
+      # custom styles, you can't know for sure when a date is a date other than
+      # looking at its format and gessing. It's not impossible to guess right,
+      # though.
+      #
+      # http://stackoverflow.com/questions/4948998/determining-if-an-xlsx-cell-is-date-formatted-for-excel-2007-spreadsheets
+      def determine_custom_style_type(string)
+        return :float if string[0] == '_'
+        return :float if string[0] == ' 0'
+        # Looks for one of ymdhis outside of meta-stuff like [Red]
+        return :date_time if string =~ /(^|\])[^\[]*[ymdhis]/i
+        :unsupported
+      end
+    end
+  end
+end

data/lib/simple_xlsx_reader/loader/workbook_parser.rb ADDED Viewed

@@ -0,0 +1,39 @@
+# frozen_string_literal: true
+module SimpleXlsxReader
+  class Loader
+    WorkbookParser = Struct.new(:file_io) do
+      def self.parse(file_io)
+        parser = new(file_io).tap(&:parse)
+        [parser.sheet_toc, parser.base_date]
+      end
+      def parse
+        @xml = Nokogiri::XML(file_io.read).remove_namespaces!
+      end
+      # Table of contents for the sheets, ex. {'Authors' => 0, ...}
+      def sheet_toc
+        @xml.xpath('/workbook/sheets/sheet')
+          .each_with_object({}) do |sheet, acc|
+            acc[sheet.attributes['name'].value] =
+              sheet.attributes['sheetId'].value.to_i - 1 # keep things 0-indexed
+          end
+      end
+      ## Returns the base_date from which to calculate dates.
+      # Defaults to 1900 (minus two days due to excel quirk), but use 1904 if
+      # it's set in the Workbook's workbookPr.
+      # http://msdn.microsoft.com/en-us/library/ff530155(v=office.12).aspx
+      def base_date
+        return DATE_SYSTEM_1900 if @xml.nil?
+        @xml.xpath('//workbook/workbookPr[@date1904]').each do |workbookPr|
+          return DATE_SYSTEM_1904 if workbookPr['date1904'] =~ /true|1/i
+        end
+        DATE_SYSTEM_1900
+      end
+    end
+  end
+end

data/lib/simple_xlsx_reader/loader.rb ADDED Viewed

@@ -0,0 +1,199 @@
+# frozen_string_literal: true
+module SimpleXlsxReader
+  class Loader < Struct.new(:file_path)
+    attr_accessor :shared_strings, :sheet_parsers, :sheet_toc, :style_types, :base_date
+    def init_sheets
+      ZipReader.new(
+        file_path: file_path,
+        loader: self
+      ).read
+      sheet_toc.each_with_index.map do |(sheet_name, _sheet_number), i|
+        # sheet_number is *not* the index into xml.sheet_parsers
+        SimpleXlsxReader::Document::Sheet.new(
+          name: sheet_name,
+          sheet_parser: sheet_parsers[i]
+        )
+      end
+    end
+    ZipReader = Struct.new(:file_path, :loader, keyword_init: true) do
+      attr_reader :zip
+      def initialize(*args)
+        super
+        @zip = SimpleXlsxReader::Zip.open(file_path)
+      end
+      def read
+        entry_at('xl/workbook.xml') do |file_io|
+          loader.sheet_toc, loader.base_date = *WorkbookParser.parse(file_io)
+        end
+        entry_at('xl/styles.xml') do |file_io|
+          loader.style_types = StyleTypesParser.parse(file_io)
+        end
+        # optional feature used by excel,
+        # but not often used by xlsx generation libraries
+        if (ss_entry = entry_at('xl/sharedStrings.xml'))
+          ss_entry.get_input_stream do |file|
+            loader.shared_strings = SharedStringsParser.parse(file)
+          end
+        else
+          loader.shared_strings = []
+        end
+        loader.sheet_parsers = []
+        # Sometimes there's a zero-index sheet.xml, ex.
+        # Google Docs creates:
+        # xl/worksheets/sheet.xml
+        # xl/worksheets/sheet1.xml
+        # xl/worksheets/sheet2.xml
+        # While Excel creates:
+        # xl/worksheets/sheet1.xml
+        # xl/worksheets/sheet2.xml
+        add_sheet_parser_at_index(nil)
+        i = 1
+        while(add_sheet_parser_at_index(i)) do
+          i += 1
+        end
+      end
+      def entry_at(path, &block)
+        # Older and newer (post-mid-2021) RubyZip normalizes pathnames,
+        # but unfortunately there is a time in between where it doesn't.
+        # Rather than require a specific version, let's just be flexible.
+        entry =
+          zip.find_entry(path) || # *nix-generated
+          zip.find_entry(path.tr('/', '\\')) || # Windows-generated
+          zip.find_entry(path.downcase) || # Sometimes it's lowercase
+          zip.find_entry(path.tr('/', '\\').downcase) # Sometimes it's lowercase
+        if block
+          entry.get_input_stream(&block)
+        else
+          entry
+        end
+      end
+      def add_sheet_parser_at_index(i)
+        sheet_file_name = "xl/worksheets/sheet#{i}.xml"
+        return unless (entry = entry_at(sheet_file_name))
+        parser =
+          SheetParser.new(
+            file_io: entry.get_input_stream,
+            loader: loader
+          )
+        relationship_file_name = "xl/worksheets/_rels/sheet#{i}.xml.rels"
+        if (rel = entry_at(relationship_file_name))
+          parser.xrels_file = rel.get_input_stream
+        end
+        loader.sheet_parsers << parser
+      end
+    end
+    ##
+    # The heart of typecasting. The ruby type is determined either explicitly
+    # from the cell xml or implicitly from the cell style, and this
+    # method expects that work to have been done already. This, then,
+    # takes the type we determined it to be and casts the cell value
+    # to that type.
+    #
+    # types:
+    # - s: shared string (see #shared_string)
+    # - n: number (cast to a float)
+    # - b: boolean
+    # - str: string
+    # - inlineStr: string
+    # - ruby symbol: for when type has been determined by style
+    #
+    # options:
+    # - shared_strings: needed for 's' (shared string) type
+    def self.cast(value, type, style, options = {})
+      return nil if value.nil? || value.empty?
+      # Sometimes the type is dictated by the style alone
+      if type.nil? ||
+         (type == 'n' && %i[date time date_time].include?(style))
+        type = style
+      end
+      casted =
+        case type
+        ##
+        # There are few built-in types
+        ##
+        when 's' # shared string
+          options[:shared_strings][value.to_i]
+        when 'n' # number
+          value.to_f
+        when 'b'
+          value.to_i == 1
+        when 'str'
+          value
+        when 'inlineStr'
+          value
+        ##
+        # Type can also be determined by a style,
+        # detected earlier and cast here by its standardized symbol
+        ##
+        when :string, :unsupported
+          value
+        when :fixnum
+          value.to_i
+        when :float
+          value.to_f
+        when :percentage
+          value.to_f / 100
+        # the trickiest. note that  all these formats can vary on
+        # whether they actually contain a date, time, or datetime.
+        when :date, :time, :date_time
+          value = Float(value)
+          days_since_date_system_start = value.to_i
+          fraction_of_24 = value - days_since_date_system_start
+          # http://stackoverflow.com/questions/10559767/how-to-convert-ms-excel-date-from-float-to-date-format-in-ruby
+          date = options.fetch(:base_date, DATE_SYSTEM_1900) + days_since_date_system_start
+          if fraction_of_24 > 0 # there is a time associated
+            seconds = (fraction_of_24 * 86_400).round
+            return Time.utc(date.year, date.month, date.day) + seconds
+          else
+            return date
+          end
+        when :bignum
+          if defined?(BigDecimal)
+            BigDecimal(value)
+          else
+            value.to_f
+          end
+        ##
+        # Beats me
+        ##
+        else
+          value
+        end
+      if options[:url]
+        Hyperlink.new(options[:url], casted)
+      else
+        casted
+      end
+    end
+  end
+end

data/lib/simple_xlsx_reader/version.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 module SimpleXlsxReader
-  VERSION = "1.0.5"
+  VERSION = '2.0.0'
 end