simple_xlsx_reader 1.0.2 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,111 +1,108 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require_relative 'test_helper'
2
4
  require 'minitest/benchmark'
3
5
 
4
6
  describe 'SimpleXlsxReader Benchmark' do
5
-
6
7
  # n is 0-indexed for us, then converted to 1-indexed for excel
7
- def build_row(n)
8
- n += 1
9
- <<-XML
10
- <row>
11
- <c r='A#{n}' s='0'>
12
- <v>Cell A#{n}</v>
13
- </c>
14
- <c r='B#{n}' s='1'>
15
- <v>2.4</v>
16
- </c>
17
- <c r='C#{n}' s='2'>
18
- <v>30687</v>
19
- </c>
20
- <c r='D#{n}' t='inlineStr' s='0'>
21
- <is><t>Cell D#{n}</t></is>
22
- </c>
23
-
24
- <c r='E#{n}' s='0'>
25
- <v>Cell E#{n}</v>
26
- </c>
27
- <c r='F#{n}' s='1'>
28
- <v>2.4</v>
29
- </c>
30
- <c r='G#{n}' s='2'>
31
- <v>30687</v>
32
- </c>
33
- <c r='H#{n}' t='inlineStr' s='0'>
34
- <is><t>Cell H#{n}</t></is>
35
- </c>
36
-
37
- <c r='I#{n}' s='0'>
38
- <v>Cell I#{n}</v>
39
- </c>
40
- <c r='J#{n}' s='1'>
41
- <v>2.4</v>
42
- </c>
43
- <c r='K#{n}' s='2'>
44
- <v>30687</v>
45
- </c>
46
- <c r='L#{n}' t='inlineStr' s='0'>
47
- <is><t>Cell L#{n}</t></is>
48
- </c>
49
- </row>
50
- XML
51
- end
52
-
53
- before do
54
- base = Nokogiri::XML(
55
- <<-XML
8
+ def sheet_with_n_rows(row_count)
9
+ acc = +""
10
+ acc <<
11
+ <<~XML
56
12
  <worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
57
13
  <sheetData>
58
- </sheetData>
59
- </worksheet>
60
14
  XML
61
- ).remove_namespaces!
62
- base.at_xpath("/worksheet/sheetData").add_child(build_row(0))
63
15
 
64
- @xml = SimpleXlsxReader::Document::Xml.new.tap do |xml|
65
- xml.sheets = [base]
66
-
67
- # s='0' above refers to the value of numFmtId at cellXfs index 0,
68
- # which is in this case 'General' type
69
- xml.styles = Nokogiri::XML(
70
- <<-XML
71
- <styleSheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
72
- <cellXfs count="1">
73
- <xf numFmtId="0" />
74
- <xf numFmtId="2" />
75
- <xf numFmtId="14" />
76
- </cellXfs>
77
- </styleSheet>
16
+ row_count.times.each do |n|
17
+ n += 1
18
+ acc <<
19
+ <<~XML
20
+ <row>
21
+ <c r='A#{n}' s='0'>
22
+ <v>Cell A#{n}</v>
23
+ </c>
24
+ <c r='B#{n}' s='1'>
25
+ <v>2.4</v>
26
+ </c>
27
+ <c r='C#{n}' s='2'>
28
+ <v>30687</v>
29
+ </c>
30
+ <c r='D#{n}' t='inlineStr' s='0'>
31
+ <is><t>Cell D#{n}</t></is>
32
+ </c>
33
+
34
+ <c r='E#{n}' s='0'>
35
+ <v>Cell E#{n}</v>
36
+ </c>
37
+ <c r='F#{n}' s='1'>
38
+ <v>2.4</v>
39
+ </c>
40
+ <c r='G#{n}' s='2'>
41
+ <v>30687</v>
42
+ </c>
43
+ <c r='H#{n}' t='inlineStr' s='0'>
44
+ <is><t>Cell H#{n}</t></is>
45
+ </c>
46
+
47
+ <c r='I#{n}' s='0'>
48
+ <v>Cell I#{n}</v>
49
+ </c>
50
+ <c r='J#{n}' s='1'>
51
+ <v>2.4</v>
52
+ </c>
53
+ <c r='K#{n}' s='2'>
54
+ <v>30687</v>
55
+ </c>
56
+ <c r='L#{n}' t='inlineStr' s='0'>
57
+ <is><t>Cell L#{n}</t></is>
58
+ </c>
59
+ </row>
78
60
  XML
79
- ).remove_namespaces!
80
61
  end
81
62
 
82
- # Every new sheet has one more row
83
- self.class.bench_range.each do |range|
84
- sheet = base.clone
63
+ acc <<
64
+ <<~XML
65
+ </sheetData>
66
+ </worksheet>
67
+ XML
68
+ end
85
69
 
86
- range.times do |n|
87
- sheet.xpath("/worksheet/sheetData/row").last.
88
- add_next_sibling(build_row(n+1))
89
- end
70
+ let(:styles) do
71
+ # s='0' above refers to the value of numFmtId at cellXfs index 0,
72
+ # which is in this case 'General' type
73
+ styles =
74
+ <<-XML
75
+ <styleSheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
76
+ <cellXfs count="1">
77
+ <xf numFmtId="0" />
78
+ <xf numFmtId="2" />
79
+ <xf numFmtId="14" />
80
+ </cellXfs>
81
+ </styleSheet>
82
+ XML
83
+ end
90
84
 
91
- @xml.sheets[range] = sheet
85
+ before do
86
+ @xlsxs = {}
87
+
88
+ # Build one xlsx archive per benchmark size; each sheet has exactly num_rows rows
89
+ self.class.bench_range.each do |num_rows|
90
+ @xlsxs[num_rows] =
91
+ TestXlsxBuilder.new(
92
+ sheets: [sheet_with_n_rows(num_rows)],
93
+ styles: styles
94
+ ).archive
92
95
  end
93
96
  end
94
97
 
95
98
  def self.bench_range
96
- bench_exp(1,10000)
99
+ # Works out to a max just shy of 265k rows, which takes ~20s on my M1 Mac.
100
+ # Second-largest is ~65k rows @ ~5s.
101
+ max = ENV['BIG_PERF_TEST'] ? 265_000 : 66_000
102
+ bench_exp(100, max, 4)
97
103
  end
98
104
 
99
- bench_performance_linear 'parses sheets in linear time', 0.9999 do |n|
100
-
101
- raise "not enough sample data; asked for #{n}, only have #{@xml.sheets.size}"\
102
- if @xml.sheets[n].nil?
103
-
104
- sheet = SimpleXlsxReader::Document::Mapper.new(@xml).
105
- parse_sheet('test', @xml.sheets[n])
106
-
107
- raise "sheet didn't parse correctly; expected #{n + 1} rows, got #{sheet.rows.size}"\
108
- if sheet.rows.size != n + 1
105
+ bench_performance_linear 'parses sheets in linear time', 0.999 do |n|
106
+ SimpleXlsxReader.open(@xlsxs[n].path).sheets[0].rows.each(headers: true) {|_row| }
109
107
  end
110
-
111
108
  end
Binary file
@@ -77,4 +77,8 @@
77
77
  <si>
78
78
  <t>Cell Fmt</t>
79
79
  </si>
80
+ <si>
81
+ <t>’ When it sees a unicode character (such as the fancy apostrophe starting this sentence), it starts chunking the stream for at least the current node, and we have to keep consuming the characters until we hit the end of the text. We can't assume that the string first given by the SAX callback us is the whole shared string content. It only happens with both unicode *and* really long text.
82
+ </t>
83
+ </si>
80
84
  </sst>