simple_xlsx_reader 1.0.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.github/workflows/ruby.yml +38 -0
- data/.travis.yml +8 -0
- data/CHANGELOG.md +22 -0
- data/README.md +190 -57
- data/Rakefile +3 -1
- data/lib/simple_xlsx_reader/document.rb +147 -0
- data/lib/simple_xlsx_reader/hyperlink.rb +30 -0
- data/lib/simple_xlsx_reader/loader/shared_strings_parser.rb +46 -0
- data/lib/simple_xlsx_reader/loader/sheet_parser.rb +256 -0
- data/lib/simple_xlsx_reader/loader/style_types_parser.rb +115 -0
- data/lib/simple_xlsx_reader/loader/workbook_parser.rb +39 -0
- data/lib/simple_xlsx_reader/loader.rb +199 -0
- data/lib/simple_xlsx_reader/version.rb +3 -1
- data/lib/simple_xlsx_reader.rb +23 -442
- data/simple_xlsx_reader.gemspec +4 -2
- data/test/date1904_test.rb +5 -4
- data/test/datetime_test.rb +17 -10
- data/test/gdocs_sheet.xlsx +0 -0
- data/test/gdocs_sheet_test.rb +16 -0
- data/test/lower_case_sharedstrings_test.rb +9 -4
- data/test/performance_test.rb +86 -89
- data/test/sesame_street_blog.xlsx +0 -0
- data/test/shared_strings.xml +4 -0
- data/test/simple_xlsx_reader_test.rb +835 -320
- data/test/test_helper.rb +4 -1
- data/test/test_xlsx_builder.rb +104 -0
- metadata +38 -9
    
        data/test/performance_test.rb
    CHANGED
    
    | @@ -1,111 +1,108 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 1 3 | 
             
            require_relative 'test_helper'
         | 
| 2 4 | 
             
            require 'minitest/benchmark'
         | 
| 3 5 |  | 
| 4 6 | 
             
            describe 'SimpleXlsxReader Benchmark' do
         | 
| 5 | 
            -
             | 
| 6 7 | 
             
              # n is 0-indexed for us, then converted to 1-indexed for excel
         | 
| 7 | 
            -
              def  | 
| 8 | 
            -
                 | 
| 9 | 
            -
                 | 
| 10 | 
            -
                   | 
| 11 | 
            -
                    <c r='A#{n}' s='0'>
         | 
| 12 | 
            -
                      <v>Cell A#{n}</v>
         | 
| 13 | 
            -
                    </c>
         | 
| 14 | 
            -
                    <c r='B#{n}' s='1'>
         | 
| 15 | 
            -
                      <v>2.4</v>
         | 
| 16 | 
            -
                    </c>
         | 
| 17 | 
            -
                    <c r='C#{n}' s='2'>
         | 
| 18 | 
            -
                      <v>30687</v>
         | 
| 19 | 
            -
                    </c>
         | 
| 20 | 
            -
                    <c r='D#{n}' t='inlineStr' s='0'>
         | 
| 21 | 
            -
                      <is><t>Cell D#{n}</t></is>
         | 
| 22 | 
            -
                    </c>
         | 
| 23 | 
            -
             | 
| 24 | 
            -
                    <c r='E#{n}' s='0'>
         | 
| 25 | 
            -
                      <v>Cell E#{n}</v>
         | 
| 26 | 
            -
                    </c>
         | 
| 27 | 
            -
                    <c r='F#{n}' s='1'>
         | 
| 28 | 
            -
                      <v>2.4</v>
         | 
| 29 | 
            -
                    </c>
         | 
| 30 | 
            -
                    <c r='G#{n}' s='2'>
         | 
| 31 | 
            -
                      <v>30687</v>
         | 
| 32 | 
            -
                    </c>
         | 
| 33 | 
            -
                    <c r='H#{n}' t='inlineStr' s='0'>
         | 
| 34 | 
            -
                      <is><t>Cell H#{n}</t></is>
         | 
| 35 | 
            -
                    </c>
         | 
| 36 | 
            -
             | 
| 37 | 
            -
                    <c r='I#{n}' s='0'>
         | 
| 38 | 
            -
                      <v>Cell I#{n}</v>
         | 
| 39 | 
            -
                    </c>
         | 
| 40 | 
            -
                    <c r='J#{n}' s='1'>
         | 
| 41 | 
            -
                      <v>2.4</v>
         | 
| 42 | 
            -
                    </c>
         | 
| 43 | 
            -
                    <c r='K#{n}' s='2'>
         | 
| 44 | 
            -
                      <v>30687</v>
         | 
| 45 | 
            -
                    </c>
         | 
| 46 | 
            -
                    <c r='L#{n}' t='inlineStr' s='0'>
         | 
| 47 | 
            -
                      <is><t>Cell L#{n}</t></is>
         | 
| 48 | 
            -
                    </c>
         | 
| 49 | 
            -
                  </row>
         | 
| 50 | 
            -
                XML
         | 
| 51 | 
            -
              end
         | 
| 52 | 
            -
             | 
| 53 | 
            -
              before do
         | 
| 54 | 
            -
                base = Nokogiri::XML(
         | 
| 55 | 
            -
                  <<-XML
         | 
| 8 | 
            +
              def sheet_with_n_rows(row_count)
         | 
| 9 | 
            +
                acc = +""
         | 
| 10 | 
            +
                acc <<
         | 
| 11 | 
            +
                  <<~XML
         | 
| 56 12 | 
             
                    <worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
         | 
| 57 13 | 
             
                      <sheetData>
         | 
| 58 | 
            -
                      </sheetData>
         | 
| 59 | 
            -
                    </worksheet>
         | 
| 60 14 | 
             
                  XML
         | 
| 61 | 
            -
                ).remove_namespaces!
         | 
| 62 | 
            -
                base.at_xpath("/worksheet/sheetData").add_child(build_row(0))
         | 
| 63 15 |  | 
| 64 | 
            -
                 | 
| 65 | 
            -
                   | 
| 66 | 
            -
             | 
| 67 | 
            -
             | 
| 68 | 
            -
             | 
| 69 | 
            -
             | 
| 70 | 
            -
             | 
| 71 | 
            -
             | 
| 72 | 
            -
                        < | 
| 73 | 
            -
                          < | 
| 74 | 
            -
             | 
| 75 | 
            -
             | 
| 76 | 
            -
             | 
| 77 | 
            -
             | 
| 16 | 
            +
                row_count.times.each do |n|
         | 
| 17 | 
            +
                  n += 1
         | 
| 18 | 
            +
                  acc <<
         | 
| 19 | 
            +
                    <<~XML
         | 
| 20 | 
            +
                      <row>
         | 
| 21 | 
            +
                        <c r='A#{n}' s='0'>
         | 
| 22 | 
            +
                          <v>Cell A#{n}</v>
         | 
| 23 | 
            +
                        </c>
         | 
| 24 | 
            +
                        <c r='B#{n}' s='1'>
         | 
| 25 | 
            +
                          <v>2.4</v>
         | 
| 26 | 
            +
                        </c>
         | 
| 27 | 
            +
                        <c r='C#{n}' s='2'>
         | 
| 28 | 
            +
                          <v>30687</v>
         | 
| 29 | 
            +
                        </c>
         | 
| 30 | 
            +
                        <c r='D#{n}' t='inlineStr' s='0'>
         | 
| 31 | 
            +
                          <is><t>Cell D#{n}</t></is>
         | 
| 32 | 
            +
                        </c>
         | 
| 33 | 
            +
             | 
| 34 | 
            +
                        <c r='E#{n}' s='0'>
         | 
| 35 | 
            +
                          <v>Cell E#{n}</v>
         | 
| 36 | 
            +
                        </c>
         | 
| 37 | 
            +
                        <c r='F#{n}' s='1'>
         | 
| 38 | 
            +
                          <v>2.4</v>
         | 
| 39 | 
            +
                        </c>
         | 
| 40 | 
            +
                        <c r='G#{n}' s='2'>
         | 
| 41 | 
            +
                          <v>30687</v>
         | 
| 42 | 
            +
                        </c>
         | 
| 43 | 
            +
                        <c r='H#{n}' t='inlineStr' s='0'>
         | 
| 44 | 
            +
                          <is><t>Cell H#{n}</t></is>
         | 
| 45 | 
            +
                        </c>
         | 
| 46 | 
            +
             | 
| 47 | 
            +
                        <c r='I#{n}' s='0'>
         | 
| 48 | 
            +
                          <v>Cell I#{n}</v>
         | 
| 49 | 
            +
                        </c>
         | 
| 50 | 
            +
                        <c r='J#{n}' s='1'>
         | 
| 51 | 
            +
                          <v>2.4</v>
         | 
| 52 | 
            +
                        </c>
         | 
| 53 | 
            +
                        <c r='K#{n}' s='2'>
         | 
| 54 | 
            +
                          <v>30687</v>
         | 
| 55 | 
            +
                        </c>
         | 
| 56 | 
            +
                        <c r='L#{n}' t='inlineStr' s='0'>
         | 
| 57 | 
            +
                          <is><t>Cell L#{n}</t></is>
         | 
| 58 | 
            +
                        </c>
         | 
| 59 | 
            +
                      </row>
         | 
| 78 60 | 
             
                    XML
         | 
| 79 | 
            -
                  ).remove_namespaces!
         | 
| 80 61 | 
             
                end
         | 
| 81 62 |  | 
| 82 | 
            -
                 | 
| 83 | 
            -
             | 
| 84 | 
            -
             | 
| 63 | 
            +
                acc <<
         | 
| 64 | 
            +
                  <<~XML
         | 
| 65 | 
            +
                      </sheetData>
         | 
| 66 | 
            +
                    </worksheet>
         | 
| 67 | 
            +
                  XML
         | 
| 68 | 
            +
              end
         | 
| 85 69 |  | 
| 86 | 
            -
             | 
| 87 | 
            -
             | 
| 88 | 
            -
             | 
| 89 | 
            -
             | 
| 70 | 
            +
              let(:styles) do
         | 
| 71 | 
            +
                # s='0' above refers to the value of numFmtId at cellXfs index 0,
         | 
| 72 | 
            +
                # which is in this case 'General' type
         | 
| 73 | 
            +
                styles =
         | 
| 74 | 
            +
                  <<-XML
         | 
| 75 | 
            +
                    <styleSheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
         | 
| 76 | 
            +
                      <cellXfs count="1">
         | 
| 77 | 
            +
                        <xf numFmtId="0" />
         | 
| 78 | 
            +
                        <xf numFmtId="2" />
         | 
| 79 | 
            +
                        <xf numFmtId="14" />
         | 
| 80 | 
            +
                      </cellXfs>
         | 
| 81 | 
            +
                    </styleSheet>
         | 
| 82 | 
            +
                  XML
         | 
| 83 | 
            +
              end
         | 
| 90 84 |  | 
| 91 | 
            -
             | 
| 85 | 
            +
              before do
         | 
| 86 | 
            +
                @xlsxs = {}
         | 
| 87 | 
            +
             | 
| 88 | 
            +
                # Build one xlsx archive per row count in bench_range
         | 
| 89 | 
            +
                self.class.bench_range.each do |num_rows|
         | 
| 90 | 
            +
                  @xlsxs[num_rows] =
         | 
| 91 | 
            +
                    TestXlsxBuilder.new(
         | 
| 92 | 
            +
                      sheets: [sheet_with_n_rows(num_rows)],
         | 
| 93 | 
            +
                      styles: styles
         | 
| 94 | 
            +
                    ).archive
         | 
| 92 95 | 
             
                end
         | 
| 93 96 | 
             
              end
         | 
| 94 97 |  | 
| 95 98 | 
             
              def self.bench_range
         | 
| 96 | 
            -
                 | 
| 99 | 
            +
                # Works out to a max just shy of 265k rows, which takes ~20s on my M1 Mac.
         | 
| 100 | 
            +
                # Second-largest is ~65k rows @ ~5s.
         | 
| 101 | 
            +
                max = ENV['BIG_PERF_TEST'] ? 265_000 : 66_000
         | 
| 102 | 
            +
                bench_exp(100, max, 4)
         | 
| 97 103 | 
             
              end
         | 
| 98 104 |  | 
| 99 | 
            -
              bench_performance_linear 'parses sheets in linear time', 0. | 
| 100 | 
            -
             | 
| 101 | 
            -
                raise "not enough sample data; asked for #{n}, only have #{@xml.sheets.size}"\
         | 
| 102 | 
            -
                  if @xml.sheets[n].nil?
         | 
| 103 | 
            -
             | 
| 104 | 
            -
                sheet = SimpleXlsxReader::Document::Mapper.new(@xml).
         | 
| 105 | 
            -
                  parse_sheet('test', @xml.sheets[n])
         | 
| 106 | 
            -
             | 
| 107 | 
            -
                raise "sheet didn't parse correctly; expected #{n + 1} rows, got #{sheet.rows.size}"\
         | 
| 108 | 
            -
                  if sheet.rows.size != n + 1
         | 
| 105 | 
            +
              bench_performance_linear 'parses sheets in linear time', 0.999 do |n|
         | 
| 106 | 
            +
                SimpleXlsxReader.open(@xlsxs[n].path).sheets[0].rows.each(headers: true) {|_row| }
         | 
| 109 107 | 
             
              end
         | 
| 110 | 
            -
             | 
| 111 108 | 
             
            end
         | 
| Binary file | 
    
        data/test/shared_strings.xml
    CHANGED
    
    | @@ -77,4 +77,8 @@ | |
| 77 77 | 
             
                <si>
         | 
| 78 78 | 
             
                    <t>Cell Fmt</t>
         | 
| 79 79 | 
             
                </si>
         | 
| 80 | 
            +
                <si>
         | 
| 81 | 
            +
                  <t>’ When it sees a unicode character (such as the fancy apostrophe starting this sentence), it starts chunking the stream for at least the current node, and we have to keep consuming the characters until we hit the end of the text. We can't assume that the string first given by the SAX callback us is the whole shared string content. It only happens with both unicode *and* really long text.
         | 
| 82 | 
            +
                  </t>
         | 
| 83 | 
            +
                </si>
         | 
| 80 84 | 
             
            </sst>
         |