simple_xlsx_reader 2.0.0 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +2 -5
- data/CHANGELOG.md +49 -0
- data/README.md +38 -29
- data/lib/simple_xlsx_reader/document.rb +6 -4
- data/lib/simple_xlsx_reader/hyperlink.rb +11 -12
- data/lib/simple_xlsx_reader/loader/sheet_parser.rb +20 -7
- data/lib/simple_xlsx_reader/loader/style_types_parser.rb +33 -32
- data/lib/simple_xlsx_reader/loader.rb +12 -6
- data/lib/simple_xlsx_reader/version.rb +1 -1
- data/lib/simple_xlsx_reader.rb +5 -2
- data/test/chunky_utf8.xlsx +0 -0
- data/test/misc_numbers.xlsx +0 -0
- data/test/percentages_n_currencies.xlsx +0 -0
- data/test/performance_test.rb +1 -1
- data/test/simple_xlsx_reader_test.rb +163 -6
- data/test/test_xlsx_builder.rb +1 -2
- metadata +8 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8552d34f153cbdc6561c40725488d193e9aa48debcded0af24d32daf01b2f951
|
4
|
+
data.tar.gz: 2a0fecdec3698bb16717244fc7bf9b45b4fe0f6b216038e9823f9a5fea2ea8fa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 77f99e8ad1020f0313171dcd0b14f7200fdf116e16de312146eb66a4d9347e94a0bf1cb4483f606975cd8bc776e80995473485271e05ee0a11136ef72cdeeae5
|
7
|
+
data.tar.gz: 7ee3ed8c37df6632981bd6eeb301de5f852df0f66534ce91593923cf1b51aa1dc0b07aed224d5d88cbd4b1f8a6901fdb17164e6e9f22fb10d4e5d90a3c24f437
|
data/.github/workflows/ruby.yml
CHANGED
@@ -22,15 +22,12 @@ jobs:
|
|
22
22
|
runs-on: ubuntu-latest
|
23
23
|
strategy:
|
24
24
|
matrix:
|
25
|
-
ruby-version: ['2.6', '2.7', '3.0']
|
25
|
+
ruby-version: ['2.6', '2.7', '3.0', '3.1', '3.2']
|
26
26
|
|
27
27
|
steps:
|
28
28
|
- uses: actions/checkout@v3
|
29
29
|
- name: Set up Ruby
|
30
|
-
|
31
|
-
# change this to (see https://github.com/ruby/setup-ruby#versioning):
|
32
|
-
# uses: ruby/setup-ruby@v1
|
33
|
-
uses: ruby/setup-ruby@2b019609e2b0f1ea1a2bc8ca11cb82ab46ada124
|
30
|
+
uses: ruby/setup-ruby@v1
|
34
31
|
with:
|
35
32
|
ruby-version: ${{ matrix.ruby-version }}
|
36
33
|
bundler-cache: true # runs 'bundle install' and caches installed gems automatically
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,52 @@
|
|
1
|
+
### 5.0.0
|
2
|
+
|
3
|
+
* Change SimpleXlsxReader::Hyperlink to default to the visible cell value
|
4
|
+
instead of the hyperlink URL, which in the case of mailto hyperlinks is
|
5
|
+
surprising.
|
6
|
+
* Fix blank content when parsing docs from string (@codemole)
|
7
|
+
|
8
|
+
### 4.0.1
|
9
|
+
|
10
|
+
* Fix nil error when handling some inline strings
|
11
|
+
|
12
|
+
Inline strings are almost exclusively used by non-Excel XLSX
|
13
|
+
implementations, but are valid, and sometimes have nil chunks.
|
14
|
+
|
15
|
+
Also, inline strings weren't preserving whitespace if Nokogiri is
|
16
|
+
parsing the string in chunks, as it does when encountering escaped
|
17
|
+
characters. Fixed.
|
18
|
+
|
19
|
+
### 4.0.0
|
20
|
+
|
21
|
+
* Fix percentage rounding errors. Previously we were dividing by 100, when we
|
22
|
+
actually don't need to, so percentage types were 100x too small. Fixes #21.
|
23
|
+
Major bump because workarounds might have been implemented for previous
|
24
|
+
incorrect behavior.
|
25
|
+
* Fix small oddity in one currency format where round numbers would be cast
|
26
|
+
to an integer instead of a float.
|
27
|
+
|
28
|
+
### 3.0.1
|
29
|
+
|
30
|
+
* Fix parsing "chunky" UTF-8 workbooks. Closes issues #39 and #45. See ce67f0d4.
|
31
|
+
|
32
|
+
### 3.0.0
|
33
|
+
|
34
|
+
* Change the way we typecast cells in the General format. This probably won't
|
35
|
+
break anything in your app, but it's a change in behavior that theoretically
|
36
|
+
could.
|
37
|
+
|
38
|
+
Previously, we were treating cells using the General format as strings, when
|
39
|
+
according to the Office XML standard, they should be treated as numbers. We
|
40
|
+
now attempt to cast such cells as numbers, and fall back to strings if number
|
41
|
+
casting fails.
|
42
|
+
|
43
|
+
Thanks @jrodrigosm
|
44
|
+
|
45
|
+
### 2.0.1
|
46
|
+
|
47
|
+
* Restore ability to parse IO strings (@robbevp)
|
48
|
+
* Add Ruby 3.1 and 3.2 to CI (@taichi-ishitani)
|
49
|
+
|
1
50
|
### 2.0.0
|
2
51
|
|
3
52
|
* SPEED
|
data/README.md
CHANGED
@@ -9,15 +9,17 @@ then forgotten. We just want to get the data, and get out!
|
|
9
9
|
|
10
10
|
## Summary (now with stream parsing):
|
11
11
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
12
|
+
```ruby
|
13
|
+
doc = SimpleXlsxReader.open('/path/to/workbook.xlsx')
|
14
|
+
doc.sheets # => [<#SXR::Sheet>, ...]
|
15
|
+
doc.sheets.first.name # 'Sheet1'
|
16
|
+
rows = doc.sheets.first.rows # <SXR::Document::RowsProxy>
|
17
|
+
rows.each # an <Enumerator> ready to chain or stream
|
18
|
+
rows.each {} # Streams the rows to your block
|
19
|
+
rows.each(headers: true) {} # Streams row-hashes
|
20
|
+
rows.each(headers: {id: /ID/}) {} # finds & maps headers, streams
|
21
|
+
rows.slurp # Slurps rows into memory as a 2D array
|
22
|
+
```
|
21
23
|
|
22
24
|
That's the gist of it!
|
23
25
|
|
@@ -29,7 +31,8 @@ See also the [Document](https://github.com/woahdae/simple_xlsx_reader/blob/2.0.0
|
|
29
31
|
|
30
32
|
This project was started years ago, primarily because other Ruby xlsx parsers
|
31
33
|
didn't import data with the correct types. Numbers as strings, dates as numbers,
|
32
|
-
hyperlinks
|
34
|
+
[hyperlinks](https://github.com/woahdae/simple_xlsx_reader/blob/master/lib/simple_xlsx_reader/hyperlink.rb)
|
35
|
+
with inaccessible URLs, or - subtly buggy - simple dates as DateTime
|
33
36
|
objects. If your app uses a timezone offset, depending on what timezone and
|
34
37
|
what time of day you load the xlsx file, your dates might end up a day off!
|
35
38
|
SimpleXlsxReader understands all these correctly.
|
@@ -39,12 +42,14 @@ SimpleXlsxReader understands all these correctly.
|
|
39
42
|
Many Ruby xlsx parsers seem to be inspired more by Excel than Ruby, frankly.
|
40
43
|
SimpleXlsxReader strives to be fairly idiomatic Ruby:
|
41
44
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
45
|
+
```ruby
|
46
|
+
# quick example having fun w/ ruby
|
47
|
+
doc = SimpleXlsxReader.open(path_or_io)
|
48
|
+
doc.sheets.first.rows.each(headers: {id: /ID/})
|
49
|
+
.with_index.with_object({}) do |(row, index), acc|
|
50
|
+
acc[row[:id]] = index
|
51
|
+
end
|
52
|
+
```
|
48
53
|
|
49
54
|
### Now faster
|
50
55
|
|
@@ -77,15 +82,19 @@ If you had an excel sheet representing this data:
|
|
77
82
|
|
78
83
|
Get a handle on the rows proxy:
|
79
84
|
|
80
|
-
|
85
|
+
```ruby
|
86
|
+
rows = SimpleXlsxReader.open('suited_heroes.xlsx').sheets.first.rows
|
87
|
+
```
|
81
88
|
|
82
89
|
Simple streaming (kinda boring):
|
83
90
|
|
84
|
-
|
91
|
+
```ruby
|
92
|
+
rows.each { |row| ... }
|
93
|
+
```
|
85
94
|
|
86
95
|
Streaming with headers, and how about a little enumerable chaining:
|
87
96
|
|
88
|
-
```
|
97
|
+
```ruby
|
89
98
|
# Map of hero names by ID: { 117 => 'John Halo', ... }
|
90
99
|
|
91
100
|
rows.each(headers: true).with_object({}) do |row, acc|
|
@@ -108,7 +117,7 @@ Sometimes though you have some junk at the top of your spreadsheet:
|
|
108
117
|
For this, `headers` can be a hash whose keys replace headers and whose values
|
109
118
|
help find the correct header row:
|
110
119
|
|
111
|
-
```
|
120
|
+
```ruby
|
112
121
|
# Same map of hero names by ID: { 117 => 'John Halo', ... }
|
113
122
|
|
114
123
|
rows.each(headers: {id: /ID/, name: /Name/}).with_object({}) do |row, acc|
|
@@ -119,7 +128,7 @@ end
|
|
119
128
|
If your header-to-attribute mapping is more complicated than key/value, you
|
120
129
|
can do the mapping elsewhere, but use a block to find the header row:
|
121
130
|
|
122
|
-
```
|
131
|
+
```ruby
|
123
132
|
# Example roughly analogous to some production code mapping a single spreadsheet
|
124
133
|
# across many objects. Might be a simpler way now that we have the headers-hash
|
125
134
|
# feature.
|
@@ -168,9 +177,11 @@ can set `SimpleXlsxReader.configuration.catch_cell_load_errors =
|
|
168
177
|
true`, and load errors will instead be inserted into Sheet#load_errors keyed
|
169
178
|
by [rownum, colnum]:
|
170
179
|
|
171
|
-
|
172
|
-
|
173
|
-
|
180
|
+
```ruby
|
181
|
+
{
|
182
|
+
[rownum, colnum] => '[error]'
|
183
|
+
}
|
184
|
+
```
|
174
185
|
|
175
186
|
### Performance
|
176
187
|
|
@@ -233,11 +244,9 @@ This project follows [semantic versioning 1.0](http://semver.org/spec/v1.0.0.htm
|
|
233
244
|
Remember to write tests, think about edge cases, and run the existing
|
234
245
|
suite.
|
235
246
|
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
or you're way off that, there is probably a performance regression in
|
240
|
-
your code.
|
247
|
+
The full suite contains a performance test that on an M1 MBP runs the final
|
248
|
+
large file in about five seconds. Check out that test before & after your
|
249
|
+
change to check for performance changes.
|
241
250
|
|
242
251
|
Then, the standard stuff:
|
243
252
|
|
@@ -8,14 +8,16 @@ module SimpleXlsxReader
|
|
8
8
|
# Main class for the public API. See the README for usage examples,
|
9
9
|
# or read the code, it's pretty friendly.
|
10
10
|
class Document
|
11
|
-
attr_reader :
|
11
|
+
attr_reader :string_or_io
|
12
12
|
|
13
|
-
def initialize(file_path)
|
14
|
-
|
13
|
+
def initialize(legacy_file_path = nil, file_path: nil, string_or_io: nil)
|
14
|
+
fail(ArgumentError, 'either file_path or string_or_io must be provided') if legacy_file_path.nil? && file_path.nil? && string_or_io.nil?
|
15
|
+
|
16
|
+
@string_or_io = string_or_io || File.new(legacy_file_path || file_path)
|
15
17
|
end
|
16
18
|
|
17
19
|
def sheets
|
18
|
-
@sheets ||= Loader.new(
|
20
|
+
@sheets ||= Loader.new(string_or_io).init_sheets
|
19
21
|
end
|
20
22
|
|
21
23
|
# Expensive because it slurps all the sheets into memory,
|
@@ -4,27 +4,26 @@ module SimpleXlsxReader
|
|
4
4
|
# We support hyperlinks as a "type" even though they're technically
|
5
5
|
# represented either as a function or an external reference in the xlsx spec.
|
6
6
|
#
|
7
|
-
#
|
8
|
-
#
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
12
|
-
# string to tack on the friendly name. This means 80% of us that just want
|
13
|
-
# the URL value will have to do nothing extra, but the 20% that might want the
|
14
|
-
# friendly name can access it.
|
7
|
+
# In practice, hyperlinks are usually a link or a mailto. In the case of a
|
8
|
+
# link, we probably want to follow it to download something, but in the case
|
9
|
+
# of an email, we probably just want the email and not the mailto. So we
|
10
|
+
# represent a hyperlink primarily as it is seen by the user, following the
|
11
|
+
# principle of least surprise, but the url is accessible via #url.
|
15
12
|
#
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
13
|
+
# Microsoft calls the visible part of a hyperlink cell the "friendly name,"
|
14
|
+
# so we expose that as a method too, in case you want to be explicit about
|
15
|
+
# how you're accessing it.
|
19
16
|
#
|
20
17
|
# See MS documentation on the HYPERLINK function for some background:
|
21
18
|
# https://support.office.com/en-us/article/HYPERLINK-function-333c7ce6-c5ae-4164-9c47-7de9b76f577f
|
22
19
|
class Hyperlink < String
|
23
20
|
attr_reader :friendly_name
|
21
|
+
attr_reader :url
|
24
22
|
|
25
23
|
def initialize(url, friendly_name = nil)
|
26
24
|
@friendly_name = friendly_name
|
27
|
-
|
25
|
+
@url = url
|
26
|
+
super(friendly_name || url)
|
28
27
|
end
|
29
28
|
end
|
30
29
|
end
|
@@ -31,10 +31,9 @@ module SimpleXlsxReader
|
|
31
31
|
@url = nil # silence warnings
|
32
32
|
@function = nil # silence warnings
|
33
33
|
@capture = nil # silence warnings
|
34
|
+
@captured = nil # silence warnings
|
34
35
|
@dimension = nil # silence warnings
|
35
36
|
|
36
|
-
@file_io.rewind # in case we've already parsed this once
|
37
|
-
|
38
37
|
# In this project this is only used for GUI-made hyperlinks (as opposed
|
39
38
|
# to FUNCTION-based hyperlinks). Unfortunately they're needed to parse
|
40
39
|
# the spreadsheet, and they come AFTER the sheet data. So, solution is
|
@@ -44,9 +43,10 @@ module SimpleXlsxReader
|
|
44
43
|
if xrels_file&.grep(/hyperlink/)&.any?
|
45
44
|
xrels_file.rewind
|
46
45
|
load_gui_hyperlinks # represented as hyperlinks_by_cell
|
47
|
-
@file_io.rewind
|
48
46
|
end
|
49
47
|
|
48
|
+
@file_io.rewind # in case we've already parsed this once
|
49
|
+
|
50
50
|
Nokogiri::XML::SAX::Parser.new(self).parse(@file_io)
|
51
51
|
end
|
52
52
|
|
@@ -77,10 +77,10 @@ module SimpleXlsxReader
|
|
77
77
|
|
78
78
|
return unless @capture
|
79
79
|
|
80
|
-
|
80
|
+
captured =
|
81
81
|
begin
|
82
82
|
SimpleXlsxReader::Loader.cast(
|
83
|
-
string
|
83
|
+
string, @type, @style,
|
84
84
|
url: @url || hyperlinks_by_cell&.[](@cell_name),
|
85
85
|
shared_strings: shared_strings,
|
86
86
|
base_date: base_date
|
@@ -99,9 +99,19 @@ module SimpleXlsxReader
|
|
99
99
|
else
|
100
100
|
@load_errors[[row_idx, col_idx]] = e.message
|
101
101
|
|
102
|
-
string
|
102
|
+
string
|
103
103
|
end
|
104
104
|
end
|
105
|
+
|
106
|
+
# For some reason I can't figure out in a reasonable timeframe,
|
107
|
+
# SAX parsing some workbooks captures separate strings in the same cell
|
108
|
+
# when we encounter UTF-8, although I can't get workbooks made in my
|
109
|
+
# own version of excel to repro it. Our fix is just to keep building
|
110
|
+
# the string in this case, although maybe there's a setting in Nokogiri
|
111
|
+
# to make it not do this (looked, couldn't find it).
|
112
|
+
#
|
113
|
+
# Loading the workbook test/chunky_utf8.xlsx repros the issue.
|
114
|
+
@captured = @captured ? @captured + (captured || '') : captured
|
105
115
|
end
|
106
116
|
|
107
117
|
def end_element(name)
|
@@ -134,7 +144,10 @@ module SimpleXlsxReader
|
|
134
144
|
# isn't the most robust strategy, but it likely fits 99% of use cases
|
135
145
|
# considering it's not a problem with actual excel docs.
|
136
146
|
@dimension = "A1:#{@cell_name}" if @dimension.nil?
|
137
|
-
when 'v', 't'
|
147
|
+
when 'v', 't'
|
148
|
+
@current_row[cell_idx] = @captured
|
149
|
+
@capture = false
|
150
|
+
@captured = nil
|
138
151
|
when 'f' then @function = false
|
139
152
|
when 'c' then @url = nil
|
140
153
|
end
|
@@ -9,38 +9,39 @@ module SimpleXlsxReader
|
|
9
9
|
|
10
10
|
# Map of non-custom numFmtId to casting symbol
|
11
11
|
NumFmtMap = {
|
12
|
-
0 => :string,
|
13
|
-
1 => :fixnum,
|
14
|
-
2 => :float,
|
15
|
-
3 => :fixnum,
|
16
|
-
4 => :float,
|
17
|
-
5 => :unsupported,
|
18
|
-
6 => :unsupported,
|
19
|
-
7 => :unsupported,
|
20
|
-
8 => :unsupported,
|
21
|
-
9 => :percentage,
|
22
|
-
10 => :percentage,
|
23
|
-
11 => :bignum,
|
24
|
-
12 => :unsupported,
|
25
|
-
13 => :unsupported,
|
26
|
-
14 => :date,
|
27
|
-
15 => :date,
|
28
|
-
16 => :date,
|
29
|
-
17 => :date,
|
30
|
-
18 => :time,
|
31
|
-
19 => :time,
|
32
|
-
20 => :time,
|
33
|
-
21 => :time,
|
34
|
-
22 => :date_time,
|
35
|
-
37 => :unsupported,
|
36
|
-
38 => :unsupported,
|
37
|
-
39 => :unsupported,
|
38
|
-
40 => :unsupported,
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
12
|
+
0 => :string, # General
|
13
|
+
1 => :fixnum, # 0
|
14
|
+
2 => :float, # 0.00
|
15
|
+
3 => :fixnum, # #,##0
|
16
|
+
4 => :float, # #,##0.00
|
17
|
+
5 => :unsupported, # $#,##0_);($#,##0)
|
18
|
+
6 => :unsupported, # $#,##0_);[Red]($#,##0)
|
19
|
+
7 => :unsupported, # $#,##0.00_);($#,##0.00)
|
20
|
+
8 => :unsupported, # $#,##0.00_);[Red]($#,##0.00)
|
21
|
+
9 => :percentage, # 0%
|
22
|
+
10 => :percentage, # 0.00%
|
23
|
+
11 => :bignum, # 0.00E+00
|
24
|
+
12 => :unsupported, # # ?/?
|
25
|
+
13 => :unsupported, # # ??/??
|
26
|
+
14 => :date, # mm-dd-yy
|
27
|
+
15 => :date, # d-mmm-yy
|
28
|
+
16 => :date, # d-mmm
|
29
|
+
17 => :date, # mmm-yy
|
30
|
+
18 => :time, # h:mm AM/PM
|
31
|
+
19 => :time, # h:mm:ss AM/PM
|
32
|
+
20 => :time, # h:mm
|
33
|
+
21 => :time, # h:mm:ss
|
34
|
+
22 => :date_time, # m/d/yy h:mm
|
35
|
+
37 => :unsupported, # #,##0 ;(#,##0)
|
36
|
+
38 => :unsupported, # #,##0 ;[Red](#,##0)
|
37
|
+
39 => :unsupported, # #,##0.00;(#,##0.00)
|
38
|
+
40 => :unsupported, # #,##0.00;[Red](#,##0.00)
|
39
|
+
44 => :float, # some odd currency format ?from Office 2007?
|
40
|
+
45 => :time, # mm:ss
|
41
|
+
46 => :time, # [h]:mm:ss
|
42
|
+
47 => :time, # mmss.0
|
43
|
+
48 => :bignum, # ##0.0E+0
|
44
|
+
49 => :unsupported # @
|
44
45
|
}.freeze
|
45
46
|
|
46
47
|
def parse
|
@@ -1,12 +1,12 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module SimpleXlsxReader
|
4
|
-
class Loader < Struct.new(:
|
4
|
+
class Loader < Struct.new(:string_or_io)
|
5
5
|
attr_accessor :shared_strings, :sheet_parsers, :sheet_toc, :style_types, :base_date
|
6
6
|
|
7
7
|
def init_sheets
|
8
8
|
ZipReader.new(
|
9
|
-
|
9
|
+
string_or_io: string_or_io,
|
10
10
|
loader: self
|
11
11
|
).read
|
12
12
|
|
@@ -19,12 +19,12 @@ module SimpleXlsxReader
|
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
22
|
-
ZipReader = Struct.new(:
|
22
|
+
ZipReader = Struct.new(:string_or_io, :loader, keyword_init: true) do
|
23
23
|
attr_reader :zip
|
24
24
|
|
25
25
|
def initialize(*args)
|
26
26
|
super
|
27
|
-
@zip = SimpleXlsxReader::Zip.
|
27
|
+
@zip = SimpleXlsxReader::Zip.open_buffer(string_or_io)
|
28
28
|
end
|
29
29
|
|
30
30
|
def read
|
@@ -149,14 +149,20 @@ module SimpleXlsxReader
|
|
149
149
|
# detected earlier and cast here by its standardized symbol
|
150
150
|
##
|
151
151
|
|
152
|
-
|
152
|
+
# no type encoded with the General format defaults to a number type
|
153
|
+
when nil, :string
|
154
|
+
retval = Integer(value, exception: false)
|
155
|
+
retval ||= Float(value, exception: false)
|
156
|
+
retval ||= value
|
157
|
+
retval
|
158
|
+
when :unsupported
|
153
159
|
value
|
154
160
|
when :fixnum
|
155
161
|
value.to_i
|
156
162
|
when :float
|
157
163
|
value.to_f
|
158
164
|
when :percentage
|
159
|
-
value.to_f
|
165
|
+
value.to_f
|
160
166
|
# the trickiest. note that all these formats can vary on
|
161
167
|
# whether they actually contain a date, time, or datetime.
|
162
168
|
when :date, :time, :date_time
|
data/lib/simple_xlsx_reader.rb
CHANGED
@@ -42,8 +42,11 @@ module SimpleXlsxReader
|
|
42
42
|
end
|
43
43
|
|
44
44
|
def open(file_path)
|
45
|
-
Document.new(file_path).tap(&:sheets)
|
45
|
+
Document.new(file_path: file_path).tap(&:sheets)
|
46
|
+
end
|
47
|
+
|
48
|
+
def parse(string_or_io)
|
49
|
+
Document.new(string_or_io: string_or_io).tap(&:sheets)
|
46
50
|
end
|
47
|
-
alias parse open
|
48
51
|
end
|
49
52
|
end
|
Binary file
|
Binary file
|
Binary file
|
data/test/performance_test.rb
CHANGED
@@ -70,7 +70,7 @@ describe 'SimpleXlsxReader Benchmark' do
|
|
70
70
|
let(:styles) do
|
71
71
|
# s='0' above refers to the value of numFmtId at cellXfs index 0,
|
72
72
|
# which is in this case 'General' type
|
73
|
-
|
73
|
+
_styles =
|
74
74
|
<<-XML
|
75
75
|
<styleSheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
|
76
76
|
<cellXfs count="1">
|
@@ -18,6 +18,7 @@ describe SimpleXlsxReader do
|
|
18
18
|
|
19
19
|
let(:sesame_street_blog_file_path) { File.join(File.dirname(__FILE__), 'sesame_street_blog.xlsx') }
|
20
20
|
let(:sesame_street_blog_io) { File.new(sesame_street_blog_file_path) }
|
21
|
+
let(:sesame_street_blog_string) { IO.read(sesame_street_blog_file_path) }
|
21
22
|
|
22
23
|
let(:expected_result) do
|
23
24
|
{
|
@@ -54,6 +55,14 @@ describe SimpleXlsxReader do
|
|
54
55
|
end
|
55
56
|
end
|
56
57
|
|
58
|
+
describe 'load from string' do
|
59
|
+
let(:subject) { SimpleXlsxReader.parse(sesame_street_blog_io) }
|
60
|
+
|
61
|
+
it 'reads an xlsx string into a hash of {[sheet name] => [data]}' do
|
62
|
+
_(subject.to_hash).must_equal(expected_result)
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
57
66
|
it 'outputs strings in UTF-8 encoding' do
|
58
67
|
document = SimpleXlsxReader.parse(sesame_street_blog_io)
|
59
68
|
_(document.sheets[0].rows.to_a.flatten.map(&:encoding).uniq)
|
@@ -83,7 +92,7 @@ describe SimpleXlsxReader do
|
|
83
92
|
body: 'The Greatest',
|
84
93
|
created_at: Time.parse('2002-01-01 11:00:00 UTC'),
|
85
94
|
count: 1,
|
86
|
-
"URL" => '
|
95
|
+
"URL" => 'This uses the HYPERLINK() function'
|
87
96
|
)
|
88
97
|
|
89
98
|
_(rows.slurped?).must_equal false
|
@@ -113,6 +122,52 @@ describe SimpleXlsxReader do
|
|
113
122
|
|
114
123
|
let(:reader) { SimpleXlsxReader.open(xlsx.archive.path) }
|
115
124
|
|
125
|
+
describe 'when parsing escaped characters' do
|
126
|
+
let(:escaped_content) do
|
127
|
+
'<a href="https://www.example.com">Link A</a> &bull; <a href="https://www.example.com">Link B</a>'
|
128
|
+
end
|
129
|
+
|
130
|
+
let(:unescaped_content) do
|
131
|
+
'<a href="https://www.example.com">Link A</a> • <a href="https://www.example.com">Link B</a>'
|
132
|
+
end
|
133
|
+
|
134
|
+
let(:sheet) do
|
135
|
+
<<~XML
|
136
|
+
<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
|
137
|
+
<dimension ref="A1:B1" />
|
138
|
+
<sheetData>
|
139
|
+
<row r="1">
|
140
|
+
<c r="A1" s="1" t="s">
|
141
|
+
<v>0</v>
|
142
|
+
</c>
|
143
|
+
<c r='B1' s='0'>
|
144
|
+
<v>#{escaped_content}</v>
|
145
|
+
</c>
|
146
|
+
</row>
|
147
|
+
</sheetData>
|
148
|
+
</worksheet>
|
149
|
+
XML
|
150
|
+
end
|
151
|
+
|
152
|
+
let(:shared_strings) do
|
153
|
+
<<~XML
|
154
|
+
<sst xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" count="1" uniqueCount="1">
|
155
|
+
<si>
|
156
|
+
<t>#{escaped_content}</t>
|
157
|
+
</si>
|
158
|
+
</sst>
|
159
|
+
XML
|
160
|
+
end
|
161
|
+
|
162
|
+
it 'loads correctly using inline strings' do
|
163
|
+
_(reader.sheets[0].rows.slurp[0][0]).must_equal(unescaped_content)
|
164
|
+
end
|
165
|
+
|
166
|
+
it 'loads correctly using shared strings' do
|
167
|
+
_(reader.sheets[0].rows.slurp[0][1]).must_equal(unescaped_content)
|
168
|
+
end
|
169
|
+
end
|
170
|
+
|
116
171
|
describe 'Sheet#rows#each(headers: true)' do
|
117
172
|
let(:sheet) do
|
118
173
|
<<~XML
|
@@ -818,6 +873,10 @@ describe SimpleXlsxReader do
|
|
818
873
|
<c r='I1' s='0'>
|
819
874
|
<v>GUI-made hyperlink</v>
|
820
875
|
</c>
|
876
|
+
|
877
|
+
<c r='J1' s='0'>
|
878
|
+
<v>1</v>
|
879
|
+
</c>
|
821
880
|
</row>
|
822
881
|
</sheetData>
|
823
882
|
|
@@ -916,6 +975,10 @@ describe SimpleXlsxReader do
|
|
916
975
|
)
|
917
976
|
)
|
918
977
|
end
|
978
|
+
|
979
|
+
it "reads 'Generic' cells with numbers as numbers" do
|
980
|
+
_(@row[9]).must_equal 1
|
981
|
+
end
|
919
982
|
end
|
920
983
|
|
921
984
|
describe 'parsing documents with blank rows' do
|
@@ -927,7 +990,7 @@ describe SimpleXlsxReader do
|
|
927
990
|
<sheetData>
|
928
991
|
<row r="2" spans="1:1">
|
929
992
|
<c r="A2" s="0">
|
930
|
-
<v>
|
993
|
+
<v>a</v>
|
931
994
|
</c>
|
932
995
|
</row>
|
933
996
|
<row r="4" spans="1:1">
|
@@ -958,13 +1021,107 @@ describe SimpleXlsxReader do
|
|
958
1021
|
it 'reads row data despite gaps in row numbering' do
|
959
1022
|
_(@rows).must_equal [
|
960
1023
|
[nil, nil, nil, nil],
|
961
|
-
['
|
1024
|
+
['a', nil, nil, nil],
|
962
1025
|
[nil, nil, nil, nil],
|
963
|
-
[nil,
|
964
|
-
[nil, nil,
|
1026
|
+
[nil, 1, nil, nil],
|
1027
|
+
[nil, nil, 2, nil],
|
965
1028
|
[nil, nil, nil, nil],
|
966
|
-
[nil, nil, nil,
|
1029
|
+
[nil, nil, nil, 3]
|
1030
|
+
]
|
1031
|
+
end
|
1032
|
+
end
|
1033
|
+
|
1034
|
+
describe 'parsing documents with non-hyperlinked rels' do
|
1035
|
+
let(:rels) do
|
1036
|
+
[
|
1037
|
+
Nokogiri::XML(
|
1038
|
+
<<-XML
|
1039
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
1040
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"></Relationships>
|
1041
|
+
XML
|
1042
|
+
).remove_namespaces!
|
967
1043
|
]
|
968
1044
|
end
|
1045
|
+
|
1046
|
+
describe 'when document is opened as path' do
|
1047
|
+
before do
|
1048
|
+
@row = SimpleXlsxReader.open(xlsx.archive.path).sheets[0].rows.to_a[0]
|
1049
|
+
end
|
1050
|
+
|
1051
|
+
it 'reads cell content' do
|
1052
|
+
_(@row[0]).must_equal 'Cell A'
|
1053
|
+
end
|
1054
|
+
end
|
1055
|
+
|
1056
|
+
describe 'when document is parsed as a String' do
|
1057
|
+
before do
|
1058
|
+
output = File.binread(xlsx.archive.path)
|
1059
|
+
@row = SimpleXlsxReader.parse(output).sheets[0].rows.to_a[0]
|
1060
|
+
end
|
1061
|
+
|
1062
|
+
it 'reads cell content' do
|
1063
|
+
_(@row[0]).must_equal 'Cell A'
|
1064
|
+
end
|
1065
|
+
end
|
1066
|
+
|
1067
|
+
describe 'when document is parsed as StringIO' do
|
1068
|
+
before do
|
1069
|
+
stream = StringIO.new(File.binread(xlsx.archive.path), 'rb')
|
1070
|
+
@row = SimpleXlsxReader.parse(stream).sheets[0].rows.to_a[0]
|
1071
|
+
stream.close
|
1072
|
+
end
|
1073
|
+
|
1074
|
+
it 'reads cell content' do
|
1075
|
+
_(@row[0]).must_equal 'Cell A'
|
1076
|
+
end
|
1077
|
+
end
|
1078
|
+
end
|
1079
|
+
|
1080
|
+
# https://support.microsoft.com/en-us/office/available-number-formats-in-excel-0afe8f52-97db-41f1-b972-4b46e9f1e8d2
|
1081
|
+
describe 'numeric fields styled as "General"' do
|
1082
|
+
let(:misc_numbers_path) do
|
1083
|
+
File.join(File.dirname(__FILE__), 'misc_numbers.xlsx')
|
1084
|
+
end
|
1085
|
+
|
1086
|
+
let(:sheet) { SimpleXlsxReader.open(misc_numbers_path).sheets[0] }
|
1087
|
+
|
1088
|
+
it 'reads medium sized integers as integers' do
|
1089
|
+
_(sheet.rows.slurp[1][0]).must_equal 98070
|
1090
|
+
end
|
1091
|
+
|
1092
|
+
it 'reads large (>12 char) integers as integers' do
|
1093
|
+
_(sheet.rows.slurp[1][1]).must_equal 1234567890123
|
1094
|
+
end
|
1095
|
+
end
|
1096
|
+
|
1097
|
+
describe 'with mysteriously chunky UTF-8 text' do
|
1098
|
+
let(:chunky_utf8_path) do
|
1099
|
+
File.join(File.dirname(__FILE__), 'chunky_utf8.xlsx')
|
1100
|
+
end
|
1101
|
+
|
1102
|
+
let(:sheet) { SimpleXlsxReader.open(chunky_utf8_path).sheets[0] }
|
1103
|
+
|
1104
|
+
it 'reads the whole cell text' do
|
1105
|
+
_(sheet.rows.slurp[1]).must_equal(
|
1106
|
+
["sample-company-1", "Korntal-Münchingen", "Bronholmer straße"]
|
1107
|
+
)
|
1108
|
+
end
|
1109
|
+
end
|
1110
|
+
|
1111
|
+
describe 'when using percentages & currencies' do
|
1112
|
+
let(:pnc_path) do
|
1113
|
+
# This file provided by a GitHub user having parse errors in these fields
|
1114
|
+
File.join(File.dirname(__FILE__), 'percentages_n_currencies.xlsx')
|
1115
|
+
end
|
1116
|
+
|
1117
|
+
let(:sheet) { SimpleXlsxReader.open(pnc_path).sheets[0] }
|
1118
|
+
|
1119
|
+
it 'reads percentages as floats of the form 0.XX' do
|
1120
|
+
_(sheet.rows.slurp[1][2]).must_equal(0.87)
|
1121
|
+
end
|
1122
|
+
|
1123
|
+
it 'reads currencies as floats' do
|
1124
|
+
_(sheet.rows.slurp[1][4]).must_equal(300.0)
|
1125
|
+
end
|
969
1126
|
end
|
970
1127
|
end
|
data/test/test_xlsx_builder.rb
CHANGED
@@ -57,7 +57,6 @@ TestXlsxBuilder = Struct.new(:shared_strings, :styles, :sheets, :workbook, :rels
|
|
57
57
|
self.styles ||= DEFAULTS[:styles]
|
58
58
|
self.sheets ||= [DEFAULTS[:sheet]]
|
59
59
|
self.rels ||= []
|
60
|
-
self.shared_strings ||= []
|
61
60
|
end
|
62
61
|
|
63
62
|
def archive
|
@@ -76,7 +75,7 @@ TestXlsxBuilder = Struct.new(:shared_strings, :styles, :sheets, :workbook, :rels
|
|
76
75
|
styles_file.write(styles)
|
77
76
|
end
|
78
77
|
|
79
|
-
if shared_strings
|
78
|
+
if shared_strings
|
80
79
|
zip.get_output_stream('xl/sharedStrings.xml') do |ss_file|
|
81
80
|
ss_file.write(shared_strings)
|
82
81
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simple_xlsx_reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 5.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Woody Peterson
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-06-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -105,6 +105,7 @@ files:
|
|
105
105
|
- lib/simple_xlsx_reader/loader/workbook_parser.rb
|
106
106
|
- lib/simple_xlsx_reader/version.rb
|
107
107
|
- simple_xlsx_reader.gemspec
|
108
|
+
- test/chunky_utf8.xlsx
|
108
109
|
- test/date1904.xlsx
|
109
110
|
- test/date1904_test.rb
|
110
111
|
- test/datetime_test.rb
|
@@ -113,6 +114,8 @@ files:
|
|
113
114
|
- test/gdocs_sheet_test.rb
|
114
115
|
- test/lower_case_sharedstrings.xlsx
|
115
116
|
- test/lower_case_sharedstrings_test.rb
|
117
|
+
- test/misc_numbers.xlsx
|
118
|
+
- test/percentages_n_currencies.xlsx
|
116
119
|
- test/performance_test.rb
|
117
120
|
- test/sesame_street_blog.xlsx
|
118
121
|
- test/shared_strings.xml
|
@@ -144,6 +147,7 @@ signing_key:
|
|
144
147
|
specification_version: 4
|
145
148
|
summary: Read xlsx data the Ruby way
|
146
149
|
test_files:
|
150
|
+
- test/chunky_utf8.xlsx
|
147
151
|
- test/date1904.xlsx
|
148
152
|
- test/date1904_test.rb
|
149
153
|
- test/datetime_test.rb
|
@@ -152,6 +156,8 @@ test_files:
|
|
152
156
|
- test/gdocs_sheet_test.rb
|
153
157
|
- test/lower_case_sharedstrings.xlsx
|
154
158
|
- test/lower_case_sharedstrings_test.rb
|
159
|
+
- test/misc_numbers.xlsx
|
160
|
+
- test/percentages_n_currencies.xlsx
|
155
161
|
- test/performance_test.rb
|
156
162
|
- test/sesame_street_blog.xlsx
|
157
163
|
- test/shared_strings.xml
|