RubyGems - simple_xlsx_reader - Versions diffs - 1.0.2 → 2.0.0 - Mend

simple_xlsx_reader 1.0.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

checksums.yaml +5 -5
data/.github/workflows/ruby.yml +38 -0
data/.travis.yml +8 -0
data/CHANGELOG.md +22 -0
data/README.md +190 -57
data/Rakefile +3 -1
data/lib/simple_xlsx_reader/document.rb +147 -0
data/lib/simple_xlsx_reader/hyperlink.rb +30 -0
data/lib/simple_xlsx_reader/loader/shared_strings_parser.rb +46 -0
data/lib/simple_xlsx_reader/loader/sheet_parser.rb +256 -0
data/lib/simple_xlsx_reader/loader/style_types_parser.rb +115 -0
data/lib/simple_xlsx_reader/loader/workbook_parser.rb +39 -0
data/lib/simple_xlsx_reader/loader.rb +199 -0
data/lib/simple_xlsx_reader/version.rb +3 -1
data/lib/simple_xlsx_reader.rb +23 -442
data/simple_xlsx_reader.gemspec +4 -2
data/test/date1904_test.rb +5 -4
data/test/datetime_test.rb +17 -10
data/test/gdocs_sheet.xlsx +0 -0
data/test/gdocs_sheet_test.rb +16 -0
data/test/lower_case_sharedstrings_test.rb +9 -4
data/test/performance_test.rb +86 -89
data/test/sesame_street_blog.xlsx +0 -0
data/test/shared_strings.xml +4 -0
data/test/simple_xlsx_reader_test.rb +835 -320
data/test/test_helper.rb +4 -1
data/test/test_xlsx_builder.rb +104 -0
metadata +38 -9

data/test/performance_test.rb CHANGED Viewed

@@ -1,111 +1,108 @@
+# frozen_string_literal: true
 require_relative 'test_helper'
 require 'minitest/benchmark'
 describe 'SimpleXlsxReader Benchmark' do
   # n is 0-indexed for us, then converted to 1-indexed for excel
-  def build_row(n)
-    n += 1
-    <<-XML
-      <row>
-        <c r='A#{n}' s='0'>
-          <v>Cell A#{n}</v>
-        </c>
-        <c r='B#{n}' s='1'>
-          <v>2.4</v>
-        </c>
-        <c r='C#{n}' s='2'>
-          <v>30687</v>
-        </c>
-        <c r='D#{n}' t='inlineStr' s='0'>
-          <is><t>Cell D#{n}</t></is>
-        </c>
-        <c r='E#{n}' s='0'>
-          <v>Cell E#{n}</v>
-        </c>
-        <c r='F#{n}' s='1'>
-          <v>2.4</v>
-        </c>
-        <c r='G#{n}' s='2'>
-          <v>30687</v>
-        </c>
-        <c r='H#{n}' t='inlineStr' s='0'>
-          <is><t>Cell H#{n}</t></is>
-        </c>
-        <c r='I#{n}' s='0'>
-          <v>Cell I#{n}</v>
-        </c>
-        <c r='J#{n}' s='1'>
-          <v>2.4</v>
-        </c>
-        <c r='K#{n}' s='2'>
-          <v>30687</v>
-        </c>
-        <c r='L#{n}' t='inlineStr' s='0'>
-          <is><t>Cell L#{n}</t></is>
-        </c>
-      </row>
-    XML
-  end
-  before do
-    base = Nokogiri::XML(
-      <<-XML
+  def sheet_with_n_rows(row_count)
+    acc = +""
+    acc <<
+      <<~XML
         <worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
           <sheetData>
-          </sheetData>
-        </worksheet>
       XML
-    ).remove_namespaces!
-    base.at_xpath("/worksheet/sheetData").add_child(build_row(0))
-    @xml = SimpleXlsxReader::Document::Xml.new.tap do |xml|
-      xml.sheets = [base]
-      # s='0' above refers to the value of numFmtId at cellXfs index 0,
-      # which is in this case 'General' type
-      xml.styles = Nokogiri::XML(
-        <<-XML
-          <styleSheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
-            <cellXfs count="1">
-              <xf numFmtId="0" />
-              <xf numFmtId="2" />
-              <xf numFmtId="14" />
-            </cellXfs>
-          </styleSheet>
+    row_count.times.each do |n|
+      n += 1
+      acc <<
+        <<~XML
+          <row>
+            <c r='A#{n}' s='0'>
+              <v>Cell A#{n}</v>
+            </c>
+            <c r='B#{n}' s='1'>
+              <v>2.4</v>
+            </c>
+            <c r='C#{n}' s='2'>
+              <v>30687</v>
+            </c>
+            <c r='D#{n}' t='inlineStr' s='0'>
+              <is><t>Cell D#{n}</t></is>
+            </c>
+            <c r='E#{n}' s='0'>
+              <v>Cell E#{n}</v>
+            </c>
+            <c r='F#{n}' s='1'>
+              <v>2.4</v>
+            </c>
+            <c r='G#{n}' s='2'>
+              <v>30687</v>
+            </c>
+            <c r='H#{n}' t='inlineStr' s='0'>
+              <is><t>Cell H#{n}</t></is>
+            </c>
+            <c r='I#{n}' s='0'>
+              <v>Cell I#{n}</v>
+            </c>
+            <c r='J#{n}' s='1'>
+              <v>2.4</v>
+            </c>
+            <c r='K#{n}' s='2'>
+              <v>30687</v>
+            </c>
+            <c r='L#{n}' t='inlineStr' s='0'>
+              <is><t>Cell L#{n}</t></is>
+            </c>
+          </row>
         XML
-      ).remove_namespaces!
     end
-    # Every new sheet has one more row
-    self.class.bench_range.each do |range|
-      sheet = base.clone
+    acc <<
+      <<~XML
+          </sheetData>
+        </worksheet>
+      XML
+  end
-      range.times do |n|
-        sheet.xpath("/worksheet/sheetData/row").last.
-          add_next_sibling(build_row(n+1))
-      end
+  let(:styles) do
+    # s='0' in the sheet XML refers to the numFmtId of the xf at cellXfs index 0,
+    # which in this case is the 'General' type
+    styles =
+      <<-XML
+        <styleSheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
+          <cellXfs count="1">
+            <xf numFmtId="0" />
+            <xf numFmtId="2" />
+            <xf numFmtId="14" />
+          </cellXfs>
+        </styleSheet>
+      XML
+  end
-      @xml.sheets[range] = sheet
+  before do
+    @xlsxs = {}
+    # Build one test archive per row count in bench_range
+    self.class.bench_range.each do |num_rows|
+      @xlsxs[num_rows] =
+        TestXlsxBuilder.new(
+          sheets: [sheet_with_n_rows(num_rows)],
+          styles: styles
+        ).archive
     end
   end
   def self.bench_range
-    bench_exp(1,10000)
+    # Works out to a max just shy of 265k rows, which takes ~20s on my M1 Mac.
+    # Second-largest is ~65k rows @ ~5s.
+    max = ENV['BIG_PERF_TEST'] ? 265_000 : 66_000
+    bench_exp(100, max, 4)
   end
-  bench_performance_linear 'parses sheets in linear time', 0.9999 do |n|
-    raise "not enough sample data; asked for #{n}, only have #{@xml.sheets.size}"\
-      if @xml.sheets[n].nil?
-    sheet = SimpleXlsxReader::Document::Mapper.new(@xml).
-      parse_sheet('test', @xml.sheets[n])
-    raise "sheet didn't parse correctly; expected #{n + 1} rows, got #{sheet.rows.size}"\
-      if sheet.rows.size != n + 1
+  bench_performance_linear 'parses sheets in linear time', 0.999 do |n|
+    SimpleXlsxReader.open(@xlsxs[n].path).sheets[0].rows.each(headers: true) {|_row| }
   end
 end

data/test/sesame_street_blog.xlsx CHANGED Viewed

Binary file

data/test/shared_strings.xml CHANGED Viewed

@@ -77,4 +77,8 @@
     <si>
         <t>Cell Fmt</t>
     </si>
+    <si>
+      <t>’ When it sees a unicode character (such as the fancy apostrophe starting this sentence), it starts chunking the stream for at least the current node, and we have to keep consuming the characters until we hit the end of the text. We can't assume that the string first given by the SAX callback us is the whole shared string content. It only happens with both unicode *and* really long text.
+      </t>
+    </si>
 </sst>