simple_xlsx_reader 1.0.2 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.github/workflows/ruby.yml +38 -0
- data/.travis.yml +8 -0
- data/CHANGELOG.md +22 -0
- data/README.md +190 -57
- data/Rakefile +3 -1
- data/lib/simple_xlsx_reader/document.rb +147 -0
- data/lib/simple_xlsx_reader/hyperlink.rb +30 -0
- data/lib/simple_xlsx_reader/loader/shared_strings_parser.rb +46 -0
- data/lib/simple_xlsx_reader/loader/sheet_parser.rb +256 -0
- data/lib/simple_xlsx_reader/loader/style_types_parser.rb +115 -0
- data/lib/simple_xlsx_reader/loader/workbook_parser.rb +39 -0
- data/lib/simple_xlsx_reader/loader.rb +199 -0
- data/lib/simple_xlsx_reader/version.rb +3 -1
- data/lib/simple_xlsx_reader.rb +23 -442
- data/simple_xlsx_reader.gemspec +4 -2
- data/test/date1904_test.rb +5 -4
- data/test/datetime_test.rb +17 -10
- data/test/gdocs_sheet.xlsx +0 -0
- data/test/gdocs_sheet_test.rb +16 -0
- data/test/lower_case_sharedstrings_test.rb +9 -4
- data/test/performance_test.rb +86 -89
- data/test/sesame_street_blog.xlsx +0 -0
- data/test/shared_strings.xml +4 -0
- data/test/simple_xlsx_reader_test.rb +835 -320
- data/test/test_helper.rb +4 -1
- data/test/test_xlsx_builder.rb +104 -0
- metadata +38 -9
data/test/performance_test.rb
CHANGED
@@ -1,111 +1,108 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require_relative 'test_helper'
|
2
4
|
require 'minitest/benchmark'
|
3
5
|
|
4
6
|
describe 'SimpleXlsxReader Benchmark' do
|
5
|
-
|
6
7
|
# n is 0-indexed for us, then converted to 1-indexed for excel
|
7
|
-
def
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
<c r='A#{n}' s='0'>
|
12
|
-
<v>Cell A#{n}</v>
|
13
|
-
</c>
|
14
|
-
<c r='B#{n}' s='1'>
|
15
|
-
<v>2.4</v>
|
16
|
-
</c>
|
17
|
-
<c r='C#{n}' s='2'>
|
18
|
-
<v>30687</v>
|
19
|
-
</c>
|
20
|
-
<c r='D#{n}' t='inlineStr' s='0'>
|
21
|
-
<is><t>Cell D#{n}</t></is>
|
22
|
-
</c>
|
23
|
-
|
24
|
-
<c r='E#{n}' s='0'>
|
25
|
-
<v>Cell E#{n}</v>
|
26
|
-
</c>
|
27
|
-
<c r='F#{n}' s='1'>
|
28
|
-
<v>2.4</v>
|
29
|
-
</c>
|
30
|
-
<c r='G#{n}' s='2'>
|
31
|
-
<v>30687</v>
|
32
|
-
</c>
|
33
|
-
<c r='H#{n}' t='inlineStr' s='0'>
|
34
|
-
<is><t>Cell H#{n}</t></is>
|
35
|
-
</c>
|
36
|
-
|
37
|
-
<c r='I#{n}' s='0'>
|
38
|
-
<v>Cell I#{n}</v>
|
39
|
-
</c>
|
40
|
-
<c r='J#{n}' s='1'>
|
41
|
-
<v>2.4</v>
|
42
|
-
</c>
|
43
|
-
<c r='K#{n}' s='2'>
|
44
|
-
<v>30687</v>
|
45
|
-
</c>
|
46
|
-
<c r='L#{n}' t='inlineStr' s='0'>
|
47
|
-
<is><t>Cell L#{n}</t></is>
|
48
|
-
</c>
|
49
|
-
</row>
|
50
|
-
XML
|
51
|
-
end
|
52
|
-
|
53
|
-
before do
|
54
|
-
base = Nokogiri::XML(
|
55
|
-
<<-XML
|
8
|
+
def sheet_with_n_rows(row_count)
|
9
|
+
acc = +""
|
10
|
+
acc <<
|
11
|
+
<<~XML
|
56
12
|
<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
|
57
13
|
<sheetData>
|
58
|
-
</sheetData>
|
59
|
-
</worksheet>
|
60
14
|
XML
|
61
|
-
).remove_namespaces!
|
62
|
-
base.at_xpath("/worksheet/sheetData").add_child(build_row(0))
|
63
15
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
<
|
73
|
-
<
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
16
|
+
row_count.times.each do |n|
|
17
|
+
n += 1
|
18
|
+
acc <<
|
19
|
+
<<~XML
|
20
|
+
<row>
|
21
|
+
<c r='A#{n}' s='0'>
|
22
|
+
<v>Cell A#{n}</v>
|
23
|
+
</c>
|
24
|
+
<c r='B#{n}' s='1'>
|
25
|
+
<v>2.4</v>
|
26
|
+
</c>
|
27
|
+
<c r='C#{n}' s='2'>
|
28
|
+
<v>30687</v>
|
29
|
+
</c>
|
30
|
+
<c r='D#{n}' t='inlineStr' s='0'>
|
31
|
+
<is><t>Cell D#{n}</t></is>
|
32
|
+
</c>
|
33
|
+
|
34
|
+
<c r='E#{n}' s='0'>
|
35
|
+
<v>Cell E#{n}</v>
|
36
|
+
</c>
|
37
|
+
<c r='F#{n}' s='1'>
|
38
|
+
<v>2.4</v>
|
39
|
+
</c>
|
40
|
+
<c r='G#{n}' s='2'>
|
41
|
+
<v>30687</v>
|
42
|
+
</c>
|
43
|
+
<c r='H#{n}' t='inlineStr' s='0'>
|
44
|
+
<is><t>Cell H#{n}</t></is>
|
45
|
+
</c>
|
46
|
+
|
47
|
+
<c r='I#{n}' s='0'>
|
48
|
+
<v>Cell I#{n}</v>
|
49
|
+
</c>
|
50
|
+
<c r='J#{n}' s='1'>
|
51
|
+
<v>2.4</v>
|
52
|
+
</c>
|
53
|
+
<c r='K#{n}' s='2'>
|
54
|
+
<v>30687</v>
|
55
|
+
</c>
|
56
|
+
<c r='L#{n}' t='inlineStr' s='0'>
|
57
|
+
<is><t>Cell L#{n}</t></is>
|
58
|
+
</c>
|
59
|
+
</row>
|
78
60
|
XML
|
79
|
-
).remove_namespaces!
|
80
61
|
end
|
81
62
|
|
82
|
-
|
83
|
-
|
84
|
-
|
63
|
+
acc <<
|
64
|
+
<<~XML
|
65
|
+
</sheetData>
|
66
|
+
</worksheet>
|
67
|
+
XML
|
68
|
+
end
|
85
69
|
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
70
|
+
let(:styles) do
|
71
|
+
# s='0' above refers to the value of numFmtId at cellXfs index 0,
|
72
|
+
# which is in this case 'General' type
|
73
|
+
styles =
|
74
|
+
<<-XML
|
75
|
+
<styleSheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
|
76
|
+
<cellXfs count="1">
|
77
|
+
<xf numFmtId="0" />
|
78
|
+
<xf numFmtId="2" />
|
79
|
+
<xf numFmtId="14" />
|
80
|
+
</cellXfs>
|
81
|
+
</styleSheet>
|
82
|
+
XML
|
83
|
+
end
|
90
84
|
|
91
|
-
|
85
|
+
before do
|
86
|
+
@xlsxs = {}
|
87
|
+
|
88
|
+
# Every new sheet has one more row
|
89
|
+
self.class.bench_range.each do |num_rows|
|
90
|
+
@xlsxs[num_rows] =
|
91
|
+
TestXlsxBuilder.new(
|
92
|
+
sheets: [sheet_with_n_rows(num_rows)],
|
93
|
+
styles: styles
|
94
|
+
).archive
|
92
95
|
end
|
93
96
|
end
|
94
97
|
|
95
98
|
def self.bench_range
|
96
|
-
|
99
|
+
# Works out to a max just shy of 265k rows, which takes ~20s on my M1 Mac.
|
100
|
+
# Second-largest is ~65k rows @ ~5s.
|
101
|
+
max = ENV['BIG_PERF_TEST'] ? 265_000 : 66_000
|
102
|
+
bench_exp(100, max, 4)
|
97
103
|
end
|
98
104
|
|
99
|
-
bench_performance_linear 'parses sheets in linear time', 0.
|
100
|
-
|
101
|
-
raise "not enough sample data; asked for #{n}, only have #{@xml.sheets.size}"\
|
102
|
-
if @xml.sheets[n].nil?
|
103
|
-
|
104
|
-
sheet = SimpleXlsxReader::Document::Mapper.new(@xml).
|
105
|
-
parse_sheet('test', @xml.sheets[n])
|
106
|
-
|
107
|
-
raise "sheet didn't parse correctly; expected #{n + 1} rows, got #{sheet.rows.size}"\
|
108
|
-
if sheet.rows.size != n + 1
|
105
|
+
bench_performance_linear 'parses sheets in linear time', 0.999 do |n|
|
106
|
+
SimpleXlsxReader.open(@xlsxs[n].path).sheets[0].rows.each(headers: true) {|_row| }
|
109
107
|
end
|
110
|
-
|
111
108
|
end
|
Binary file
|
data/test/shared_strings.xml
CHANGED
@@ -77,4 +77,8 @@
|
|
77
77
|
<si>
|
78
78
|
<t>Cell Fmt</t>
|
79
79
|
</si>
|
80
|
+
<si>
|
81
|
+
<t>’ When it sees a unicode character (such as the fancy apostrophe starting this sentence), it starts chunking the stream for at least the current node, and we have to keep consuming the characters until we hit the end of the text. We can't assume that the string first given by the SAX callback us is the whole shared string content. It only happens with both unicode *and* really long text.
|
82
|
+
</t>
|
83
|
+
</si>
|
80
84
|
</sst>
|